[llvm] 86eff6b - [MachineCombiner] Use default latency model when no detailed model available

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 20 09:28:36 PST 2023


Author: Philip Reames
Date: 2023-01-20T09:28:20-08:00
New Revision: 86eff6be686a1e41e13c08ebfc2db4dd4d58e7c6

URL: https://github.com/llvm/llvm-project/commit/86eff6be686a1e41e13c08ebfc2db4dd4d58e7c6
DIFF: https://github.com/llvm/llvm-project/commit/86eff6be686a1e41e13c08ebfc2db4dd4d58e7c6.diff

LOG: [MachineCombiner] Use default latency model when no detailed model available

This change adjusts the cost modeling used when the target does not have a schedule model with individual instruction latencies. After this change, we use the default latency information available from TargetSchedule, which essentially treats most instructions as latency 1, with a few "expensive" ones getting a higher cost.
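
For context, the default information reduces to roughly the logic below, a simplified sketch of TargetInstrInfo::defaultDefLatency (the real code lives in llvm/lib/CodeGen/TargetInstrInfo.cpp):

    // Sketch: with no per-instruction latencies, a def is assumed to
    // take one cycle unless it is a load or a target-designated
    // high-latency opcode.
    unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel &SchedModel,
                                                const MachineInstr &DefMI) const {
      if (DefMI.mayLoad())
        return SchedModel.LoadLatency;   // 4 in the default model
      if (isHighLatencyDef(DefMI.getOpcode()))
        return SchedModel.HighLatency;   // 10 in the default model
      return 1;
    }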

Previously, we unconditionally applied the first legal pattern, without any consideration of profitability. As a result, this change both prevents some patterns from being applied and changes which patterns are exercised (i.e. previously the first pattern was applied; now a later one may be, because the first wasn't profitable).
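
For reference, the profitability check that now runs in all cases has roughly this shape (a sketch of improvesCriticalPathLen; names follow MachineCombiner.cpp, details elided):

    // Accept a combine only if the new sequence does not lengthen the
    // critical path through the root instruction.
    unsigned NewCycleCount = NewRootDepth + NewRootLatency;
    unsigned OldCycleCount =
        RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
    return NewCycleCount <= OldCycleCount;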

The motivation here is twofold.

First, this brings the default behavior in line with the behavior when -mcpu or -mtune is specified. This improves test coverage and generally makes it less likely that we will hit bad surprises when providing more information to the compiler.

Second, this enables some reassociation for ILP by default. Despite being unconditionally enabled, the prior code tended to "reassociate" repeatedly through an entire chain, simply moving the first operand to the end. The result was still a serial chain, just a different one. With this change, one of the intermediate transforms is unprofitable and we end up with a partially flattened tree, as illustrated below.
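
As a hypothetical illustration (a C sketch with invented names, not taken from the tests), the dataflow shape this change buys:

    // Serial chain: each add depends on the previous result (depth 3).
    long serial = ((a + b) + c) + d;
    // Partially flattened: (a + b) and (c + d) are independent and can
    // issue in parallel, shortening the critical path to depth 2.
    long flat = (a + b) + (c + d);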

Note that the resulting code diffs show significant room for improvement in the basic algorithm. I am intentionally excluding those improvements from this patch.

For the test diffs, I don't see any concerning regressions. I took a fairly close look at the RISCV ones, but only skimmed the x86 (particularly vector x86) changes.

Differential Revision: https://reviews.llvm.org/D141017

Added: 
    

Modified: 
    llvm/lib/CodeGen/MachineCombiner.cpp
    llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll
    llvm/test/CodeGen/RISCV/addcarry.ll
    llvm/test/CodeGen/RISCV/addimm-mulimm.ll
    llvm/test/CodeGen/RISCV/alu64.ll
    llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
    llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
    llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
    llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
    llvm/test/CodeGen/RISCV/copysign-casts.ll
    llvm/test/CodeGen/RISCV/div-by-constant.ll
    llvm/test/CodeGen/RISCV/div-pow2.ll
    llvm/test/CodeGen/RISCV/div.ll
    llvm/test/CodeGen/RISCV/fpclamptosat.ll
    llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll
    llvm/test/CodeGen/RISCV/iabs.ll
    llvm/test/CodeGen/RISCV/mul.ll
    llvm/test/CodeGen/RISCV/neg-abs.ll
    llvm/test/CodeGen/RISCV/rotl-rotr.ll
    llvm/test/CodeGen/RISCV/rv32zbb.ll
    llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
    llvm/test/CodeGen/RISCV/rv64zbb.ll
    llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
    llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
    llvm/test/CodeGen/RISCV/sadd_sat.ll
    llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
    llvm/test/CodeGen/RISCV/select-binop-identity.ll
    llvm/test/CodeGen/RISCV/shadowcallstack.ll
    llvm/test/CodeGen/RISCV/shifts.ll
    llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
    llvm/test/CodeGen/RISCV/srem-lkk.ll
    llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
    llvm/test/CodeGen/RISCV/ssub_sat.ll
    llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
    llvm/test/CodeGen/RISCV/uadd_sat.ll
    llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
    llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/RISCV/unaligned-load-store.ll
    llvm/test/CodeGen/RISCV/urem-lkk.ll
    llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
    llvm/test/CodeGen/RISCV/usub_sat.ll
    llvm/test/CodeGen/RISCV/usub_sat_plus.ll
    llvm/test/CodeGen/RISCV/vararg.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/RISCV/xaluo.ll
    llvm/test/CodeGen/X86/abdu-vector-128.ll
    llvm/test/CodeGen/X86/add-sub-bool.ll
    llvm/test/CodeGen/X86/alias-static-alloca.ll
    llvm/test/CodeGen/X86/avx-vinsertf128.ll
    llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
    llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
    llvm/test/CodeGen/X86/avx512-intrinsics.ll
    llvm/test/CodeGen/X86/avx512-mask-op.ll
    llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
    llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
    llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
    llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
    llvm/test/CodeGen/X86/avx512fp16-mov.ll
    llvm/test/CodeGen/X86/avx512fp16-mscatter.ll
    llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
    llvm/test/CodeGen/X86/bmi-out-of-order.ll
    llvm/test/CodeGen/X86/combine-add.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
    llvm/test/CodeGen/X86/divide-by-constant.ll
    llvm/test/CodeGen/X86/divmod128.ll
    llvm/test/CodeGen/X86/fold-add.ll
    llvm/test/CodeGen/X86/fold-masked-merge.ll
    llvm/test/CodeGen/X86/fold-tied-op.ll
    llvm/test/CodeGen/X86/h-registers-1.ll
    llvm/test/CodeGen/X86/hipe-cc.ll
    llvm/test/CodeGen/X86/hipe-cc64.ll
    llvm/test/CodeGen/X86/horizontal-reduce-add.ll
    llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
    llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
    llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
    llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
    llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
    llvm/test/CodeGen/X86/horizontal-sum.ll
    llvm/test/CodeGen/X86/icmp-shift-opt.ll
    llvm/test/CodeGen/X86/imul.ll
    llvm/test/CodeGen/X86/lea-opt-cse4.ll
    llvm/test/CodeGen/X86/lea-opt2.ll
    llvm/test/CodeGen/X86/logic-shift.ll
    llvm/test/CodeGen/X86/machine-cp.ll
    llvm/test/CodeGen/X86/madd.ll
    llvm/test/CodeGen/X86/masked_gather.ll
    llvm/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
    llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
    llvm/test/CodeGen/X86/midpoint-int.ll
    llvm/test/CodeGen/X86/movmsk-cmp.ll
    llvm/test/CodeGen/X86/mul-constant-i64.ll
    llvm/test/CodeGen/X86/mul-constant-result.ll
    llvm/test/CodeGen/X86/mul-i1024.ll
    llvm/test/CodeGen/X86/mul-i256.ll
    llvm/test/CodeGen/X86/mul-i512.ll
    llvm/test/CodeGen/X86/mul128.ll
    llvm/test/CodeGen/X86/mul64.ll
    llvm/test/CodeGen/X86/muloti.ll
    llvm/test/CodeGen/X86/optimize-max-0.ll
    llvm/test/CodeGen/X86/popcnt.ll
    llvm/test/CodeGen/X86/pr34080-2.ll
    llvm/test/CodeGen/X86/ptest.ll
    llvm/test/CodeGen/X86/rev16.ll
    llvm/test/CodeGen/X86/rotate-multi.ll
    llvm/test/CodeGen/X86/sad.ll
    llvm/test/CodeGen/X86/setcc-wide-types.ll
    llvm/test/CodeGen/X86/shift-combine.ll
    llvm/test/CodeGen/X86/smul-with-overflow.ll
    llvm/test/CodeGen/X86/smul_fix.ll
    llvm/test/CodeGen/X86/smul_fix_sat.ll
    llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/sse-regcall.ll
    llvm/test/CodeGen/X86/stack-clash-large.ll
    llvm/test/CodeGen/X86/statepoint-live-in.ll
    llvm/test/CodeGen/X86/statepoint-regs.ll
    llvm/test/CodeGen/X86/swift-return.ll
    llvm/test/CodeGen/X86/umul-with-overflow.ll
    llvm/test/CodeGen/X86/umul_fix.ll
    llvm/test/CodeGen/X86/umul_fix_sat.ll
    llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
    llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
    llvm/test/CodeGen/X86/v8i1-masks.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/vec_umulo.ll
    llvm/test/CodeGen/X86/vector-fshr-128.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
    llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
    llvm/test/CodeGen/X86/vector-pcmp.ll
    llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
    llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
    llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
    llvm/test/CodeGen/X86/vector-reduce-add.ll
    llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
    llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
    llvm/test/CodeGen/X86/vector-reduce-and.ll
    llvm/test/CodeGen/X86/vector-reduce-mul.ll
    llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
    llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
    llvm/test/CodeGen/X86/vector-reduce-or.ll
    llvm/test/CodeGen/X86/vector-reduce-smax.ll
    llvm/test/CodeGen/X86/vector-reduce-smin.ll
    llvm/test/CodeGen/X86/vector-reduce-umax.ll
    llvm/test/CodeGen/X86/vector-reduce-umin.ll
    llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
    llvm/test/CodeGen/X86/vector-reduce-xor.ll
    llvm/test/CodeGen/X86/vector-trunc-math.ll
    llvm/test/CodeGen/X86/vector-trunc-packus.ll
    llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
    llvm/test/CodeGen/X86/win-smallparams.ll
    llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll
    llvm/test/CodeGen/X86/x86-interleaved-access.ll
    llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index f3a3a64940921..974d570ece51f 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -209,9 +209,6 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                           MachineTraceMetrics::Trace BlockTrace) {
   SmallVector<unsigned, 16> InstrDepth;
-  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
-         "Missing machine model\n");
-
   // For each instruction in the new sequence compute the depth based on the
   // operands. Use the trace information when possible. For new operands which
   // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
@@ -267,9 +264,6 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
 /// \returns Latency of \p NewRoot
 unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                      MachineTraceMetrics::Trace BlockTrace) {
-  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
-         "Missing machine model\n");
-
   // Check each definition in NewRoot and compute the latency
   unsigned NewRootLatency = 0;
 
@@ -379,8 +373,6 @@ bool MachineCombiner::improvesCriticalPathLen(
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
     MachineCombinerPattern Pattern,
     bool SlackIsAccurate) {
-  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
-         "Missing machine model\n");
   // Get depth and latency of NewRoot and Root.
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
@@ -696,13 +688,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
-      } else if (!TSchedModel.hasInstrSchedModelOrItineraries()) {
-        LLVM_DEBUG(dbgs() << "\t Replacing due to lack of schedule model\n");
-        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
-                                 RegUnits, TII, P, IncrementalUpdate);
-        // Eagerly stop after the first pattern fires.
-        Changed = true;
-        break;
       } else {
         // For big basic blocks, we only compute the full trace the first time
         // we hit this. We do not invalidate the trace, but instead update the

diff  --git a/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll b/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll
index 268e2a85f3fe1..5fd8261e27cc3 100644
--- a/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll
+++ b/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll
@@ -7,9 +7,9 @@
 define i64 @addc_adde(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: addc_adde:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a0, a3, a0
 ; RV32I-NEXT:    add a1, a1, a0
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
@@ -21,8 +21,8 @@ define i64 @subc_sube(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: subc_sube:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
-; RV32I-NEXT:    add a3, a3, a4
 ; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
   %1 = sub i64 %a, %b

diff  --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll
index 6c300cbde795d..3a4163a8bb50f 100644
--- a/llvm/test/CodeGen/RISCV/addcarry.ll
+++ b/llvm/test/CodeGen/RISCV/addcarry.ll
@@ -19,11 +19,11 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
 ; RISCV32-NEXT:    sltu a5, a6, a5
 ; RISCV32-NEXT:    mulhu a6, a0, a3
 ; RISCV32-NEXT:    mulhu t0, a1, a2
-; RISCV32-NEXT:    add a5, a5, t0
-; RISCV32-NEXT:    add a5, a5, a7
-; RISCV32-NEXT:    mul a7, a1, a3
-; RISCV32-NEXT:    add a5, a5, a7
+; RISCV32-NEXT:    add a6, a6, t0
 ; RISCV32-NEXT:    add a5, a6, a5
+; RISCV32-NEXT:    add a5, a5, a7
+; RISCV32-NEXT:    mul a6, a1, a3
+; RISCV32-NEXT:    add a5, a5, a6
 ; RISCV32-NEXT:    bgez a1, .LBB0_2
 ; RISCV32-NEXT:  # %bb.1:
 ; RISCV32-NEXT:    sub a5, a5, a2

diff  --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index 83b4f0392942d..4454af837004a 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -51,11 +51,11 @@ define i64 @add_mul_combine_accept_a3(i64 %x) {
 ; RV32IMB-NEXT:    li a2, 29
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    addi a0, a2, 1073
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_accept_a3:
@@ -121,13 +121,13 @@ define i64 @add_mul_combine_accept_b3(i64 %x) {
 ; RV32IMB-NEXT:    li a2, 23
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 50
 ; RV32IMB-NEXT:    addi a0, a0, 1119
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_accept_b3:
@@ -187,13 +187,13 @@ define i64 @add_mul_combine_reject_a3(i64 %x) {
 ; RV32IMB-NEXT:    li a2, 29
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -185
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_a3:
@@ -253,13 +253,13 @@ define i64 @add_mul_combine_reject_c3(i64 %x) {
 ; RV32IMB-NEXT:    li a2, 73
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 18
 ; RV32IMB-NEXT:    addi a0, a0, -728
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_c3:
@@ -318,14 +318,14 @@ define i64 @add_mul_combine_reject_d3(i64 %x) {
 ; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a1, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 6
+; RV32IMB-NEXT:    add a1, a2, a1
 ; RV32IMB-NEXT:    sh1add a0, a0, a0
-; RV32IMB-NEXT:    slli a3, a0, 6
+; RV32IMB-NEXT:    slli a2, a0, 6
 ; RV32IMB-NEXT:    lui a0, 47
 ; RV32IMB-NEXT:    addi a0, a0, -512
-; RV32IMB-NEXT:    add a0, a3, a0
-; RV32IMB-NEXT:    sltu a3, a0, a3
-; RV32IMB-NEXT:    add a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a2, a0
+; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_d3:
@@ -383,13 +383,13 @@ define i64 @add_mul_combine_reject_e3(i64 %x) {
 ; RV32IMB-NEXT:    li a2, 29
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -185
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_e3:
@@ -451,13 +451,13 @@ define i64 @add_mul_combine_reject_f3(i64 %x) {
 ; RV32IMB-NEXT:    li a2, 29
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -145
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_f3:
@@ -520,13 +520,13 @@ define i64 @add_mul_combine_reject_g3(i64 %x) {
 ; RV32IMB-NEXT:    li a2, 73
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, -882
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_g3:
@@ -622,13 +622,13 @@ define i64 @mul3000_add8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, -1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, 798
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: mul3000_add8990_c:
@@ -697,13 +697,13 @@ define i64 @mul3000_sub8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, -1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 1048574
 ; RV32IMB-NEXT:    addi a0, a0, -798
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    addi a1, a1, -1
 ; RV32IMB-NEXT:    ret
 ;
@@ -773,14 +773,14 @@ define i64 @mulneg3000_add8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, 1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    sub a1, a0, a1
+; RV32IMB-NEXT:    sub a3, a3, a0
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, 798
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
-; RV32IMB-NEXT:    sub a1, a1, a2
-; RV32IMB-NEXT:    sub a1, a3, a1
+; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: mulneg3000_add8990_c:
@@ -849,14 +849,14 @@ define i64 @mulneg3000_sub8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, 1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    sub a1, a0, a1
+; RV32IMB-NEXT:    sub a3, a3, a0
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 1048574
 ; RV32IMB-NEXT:    addi a0, a0, -798
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
-; RV32IMB-NEXT:    sub a1, a1, a2
-; RV32IMB-NEXT:    sub a1, a3, a1
+; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    addi a1, a1, -1
 ; RV32IMB-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll
index 5b7510ab982b7..9743c49f82d7b 100644
--- a/llvm/test/CodeGen/RISCV/alu64.ll
+++ b/llvm/test/CodeGen/RISCV/alu64.ll
@@ -172,9 +172,9 @@ define i64 @add(i64 %a, i64 %b) nounwind {
 ;
 ; RV32I-LABEL: add:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a0, a3, a0
 ; RV32I-NEXT:    add a1, a1, a0
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
@@ -191,8 +191,8 @@ define i64 @sub(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: sub:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
-; RV32I-NEXT:    add a3, a3, a4
 ; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
   %1 = sub i64 %a, %b

diff  --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index 569bb8e34fe43..616b9fce88c90 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -64,7 +64,7 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    and a2, a0, a2
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
 ;
@@ -79,7 +79,7 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
-; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -109,7 +109,7 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    and a4, a1, a3
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a2, a4, a2
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a2, a1, a2
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    and a1, a1, a3
@@ -118,7 +118,7 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    and a3, a0, a3
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
@@ -137,8 +137,8 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    srli a5, a0, 8
 ; RV64I-NEXT:    srliw a5, a5, 24
 ; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    or a1, a5, a1
 ; RV64I-NEXT:    and a4, a0, a4
 ; RV64I-NEXT:    slli a4, a4, 24
 ; RV64I-NEXT:    srliw a3, a0, 24
@@ -147,8 +147,8 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -401,7 +401,7 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    and a2, a0, a2
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    lui a2, 61681
@@ -437,7 +437,7 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
-; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    lui a2, 61681
@@ -545,7 +545,7 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    and a4, a1, a3
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a2, a4, a2
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    srli a2, a1, 4
 ; RV32I-NEXT:    lui a4, 61681
@@ -575,7 +575,7 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32I-NEXT:    and a3, a0, a3
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    and a1, a1, a4
@@ -609,8 +609,8 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    srli a5, a0, 8
 ; RV64I-NEXT:    srliw a5, a5, 24
 ; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    or a1, a5, a1
 ; RV64I-NEXT:    and a4, a0, a4
 ; RV64I-NEXT:    slli a4, a4, 24
 ; RV64I-NEXT:    srliw a3, a0, 24
@@ -619,14 +619,14 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    lui a3, %hi(.LCPI6_0)
-; RV64I-NEXT:    ld a3, %lo(.LCPI6_0)(a3)
-; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    lui a2, %hi(.LCPI6_0)
+; RV64I-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    lui a2, %hi(.LCPI6_1)
 ; RV64I-NEXT:    ld a2, %lo(.LCPI6_1)(a2)
 ; RV64I-NEXT:    slli a0, a0, 4

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
index 8c655f5cb7fa7..24e2d31707e9e 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
@@ -94,15 +94,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-FPELIM-LABEL: callee_aligned_stack:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 0(a2)
-; RV32I-FPELIM-NEXT:    lw a1, 0(sp)
-; RV32I-FPELIM-NEXT:    lw a2, 8(sp)
-; RV32I-FPELIM-NEXT:    lw a3, 16(sp)
-; RV32I-FPELIM-NEXT:    lw a4, 20(sp)
-; RV32I-FPELIM-NEXT:    add a1, a7, a1
-; RV32I-FPELIM-NEXT:    add a1, a1, a2
-; RV32I-FPELIM-NEXT:    add a1, a1, a3
-; RV32I-FPELIM-NEXT:    add a1, a1, a4
+; RV32I-FPELIM-NEXT:    lw a1, 8(sp)
+; RV32I-FPELIM-NEXT:    lw a2, 0(sp)
+; RV32I-FPELIM-NEXT:    lw a3, 20(sp)
+; RV32I-FPELIM-NEXT:    lw a4, 16(sp)
+; RV32I-FPELIM-NEXT:    add a0, a0, a7
+; RV32I-FPELIM-NEXT:    add a1, a2, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a1
+; RV32I-FPELIM-NEXT:    add a3, a4, a3
+; RV32I-FPELIM-NEXT:    add a0, a0, a3
 ; RV32I-FPELIM-NEXT:    ret
 ;
 ; RV32I-WITHFP-LABEL: callee_aligned_stack:
@@ -112,15 +112,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 0(a2)
-; RV32I-WITHFP-NEXT:    lw a1, 0(s0)
-; RV32I-WITHFP-NEXT:    lw a2, 8(s0)
-; RV32I-WITHFP-NEXT:    lw a3, 16(s0)
-; RV32I-WITHFP-NEXT:    lw a4, 20(s0)
-; RV32I-WITHFP-NEXT:    add a1, a7, a1
-; RV32I-WITHFP-NEXT:    add a1, a1, a2
-; RV32I-WITHFP-NEXT:    add a1, a1, a3
-; RV32I-WITHFP-NEXT:    add a1, a1, a4
+; RV32I-WITHFP-NEXT:    lw a1, 8(s0)
+; RV32I-WITHFP-NEXT:    lw a2, 0(s0)
+; RV32I-WITHFP-NEXT:    lw a3, 20(s0)
+; RV32I-WITHFP-NEXT:    lw a4, 16(s0)
+; RV32I-WITHFP-NEXT:    add a0, a0, a7
+; RV32I-WITHFP-NEXT:    add a1, a2, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a1
+; RV32I-WITHFP-NEXT:    add a3, a4, a3
+; RV32I-WITHFP-NEXT:    add a0, a0, a3
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 3a5d2c4bc10ff..fbec0089f0d38 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -87,12 +87,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-FPELIM-NEXT:    andi a0, a0, 255
 ; RV32I-FPELIM-NEXT:    slli a1, a1, 16
 ; RV32I-FPELIM-NEXT:    srli a1, a1, 16
-; RV32I-FPELIM-NEXT:    add a1, a1, a2
-; RV32I-FPELIM-NEXT:    xor a2, a4, t1
-; RV32I-FPELIM-NEXT:    xor a3, a3, a7
-; RV32I-FPELIM-NEXT:    or a2, a3, a2
-; RV32I-FPELIM-NEXT:    seqz a2, a2
-; RV32I-FPELIM-NEXT:    add a1, a2, a1
+; RV32I-FPELIM-NEXT:    add a0, a0, a2
+; RV32I-FPELIM-NEXT:    add a0, a0, a1
+; RV32I-FPELIM-NEXT:    xor a1, a4, t1
+; RV32I-FPELIM-NEXT:    xor a2, a3, a7
+; RV32I-FPELIM-NEXT:    or a1, a2, a1
+; RV32I-FPELIM-NEXT:    seqz a1, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a5
 ; RV32I-FPELIM-NEXT:    add a0, a0, a6
 ; RV32I-FPELIM-NEXT:    add a0, a0, t0
@@ -110,12 +110,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-WITHFP-NEXT:    andi a0, a0, 255
 ; RV32I-WITHFP-NEXT:    slli a1, a1, 16
 ; RV32I-WITHFP-NEXT:    srli a1, a1, 16
-; RV32I-WITHFP-NEXT:    add a1, a1, a2
-; RV32I-WITHFP-NEXT:    xor a2, a4, t1
-; RV32I-WITHFP-NEXT:    xor a3, a3, a7
-; RV32I-WITHFP-NEXT:    or a2, a3, a2
-; RV32I-WITHFP-NEXT:    seqz a2, a2
-; RV32I-WITHFP-NEXT:    add a1, a2, a1
+; RV32I-WITHFP-NEXT:    add a0, a0, a2
+; RV32I-WITHFP-NEXT:    add a0, a0, a1
+; RV32I-WITHFP-NEXT:    xor a1, a4, t1
+; RV32I-WITHFP-NEXT:    xor a2, a3, a7
+; RV32I-WITHFP-NEXT:    or a1, a2, a1
+; RV32I-WITHFP-NEXT:    seqz a1, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a5
 ; RV32I-WITHFP-NEXT:    add a0, a0, a6
 ; RV32I-WITHFP-NEXT:    add a0, a0, t0
@@ -203,8 +203,8 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-NEXT:    or a4, a4, a5
 ; RV32I-FPELIM-NEXT:    xor a0, a0, a1
 ; RV32I-FPELIM-NEXT:    xor a2, a3, a2
-; RV32I-FPELIM-NEXT:    or a0, a0, a4
 ; RV32I-FPELIM-NEXT:    or a0, a2, a0
+; RV32I-FPELIM-NEXT:    or a0, a0, a4
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -227,8 +227,8 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    or a4, a4, a5
 ; RV32I-WITHFP-NEXT:    xor a0, a0, a1
 ; RV32I-WITHFP-NEXT:    xor a2, a3, a2
-; RV32I-WITHFP-NEXT:    or a0, a0, a4
 ; RV32I-WITHFP-NEXT:    or a0, a2, a0
+; RV32I-WITHFP-NEXT:    or a0, a0, a4
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -310,8 +310,8 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-NEXT:    or a3, a3, a4
 ; RV32I-FPELIM-NEXT:    xor a0, a7, a0
 ; RV32I-FPELIM-NEXT:    xor a1, a2, a1
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
 ; RV32I-FPELIM-NEXT:    or a0, a1, a0
+; RV32I-FPELIM-NEXT:    or a0, a0, a3
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -335,8 +335,8 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    or a3, a3, a4
 ; RV32I-WITHFP-NEXT:    xor a0, a7, a0
 ; RV32I-WITHFP-NEXT:    xor a1, a2, a1
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
 ; RV32I-WITHFP-NEXT:    or a0, a1, a0
+; RV32I-WITHFP-NEXT:    or a0, a0, a3
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -614,15 +614,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-FPELIM-LABEL: callee_aligned_stack:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    lw a0, 0(a2)
-; RV32I-FPELIM-NEXT:    lw a1, 0(sp)
-; RV32I-FPELIM-NEXT:    lw a2, 8(sp)
-; RV32I-FPELIM-NEXT:    lw a3, 16(sp)
-; RV32I-FPELIM-NEXT:    lw a4, 20(sp)
-; RV32I-FPELIM-NEXT:    add a1, a7, a1
-; RV32I-FPELIM-NEXT:    add a1, a1, a2
-; RV32I-FPELIM-NEXT:    add a1, a1, a3
-; RV32I-FPELIM-NEXT:    add a1, a1, a4
+; RV32I-FPELIM-NEXT:    lw a1, 8(sp)
+; RV32I-FPELIM-NEXT:    lw a2, 0(sp)
+; RV32I-FPELIM-NEXT:    lw a3, 20(sp)
+; RV32I-FPELIM-NEXT:    lw a4, 16(sp)
+; RV32I-FPELIM-NEXT:    add a0, a0, a7
+; RV32I-FPELIM-NEXT:    add a1, a2, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a1
+; RV32I-FPELIM-NEXT:    add a3, a4, a3
+; RV32I-FPELIM-NEXT:    add a0, a0, a3
 ; RV32I-FPELIM-NEXT:    ret
 ;
 ; RV32I-WITHFP-LABEL: callee_aligned_stack:
@@ -632,15 +632,15 @@ define i32 @callee_aligned_stack(i32 %a, i32 %b, fp128 %c, i32 %d, i32 %e, i64 %
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
 ; RV32I-WITHFP-NEXT:    lw a0, 0(a2)
-; RV32I-WITHFP-NEXT:    lw a1, 0(s0)
-; RV32I-WITHFP-NEXT:    lw a2, 8(s0)
-; RV32I-WITHFP-NEXT:    lw a3, 16(s0)
-; RV32I-WITHFP-NEXT:    lw a4, 20(s0)
-; RV32I-WITHFP-NEXT:    add a1, a7, a1
-; RV32I-WITHFP-NEXT:    add a1, a1, a2
-; RV32I-WITHFP-NEXT:    add a1, a1, a3
-; RV32I-WITHFP-NEXT:    add a1, a1, a4
+; RV32I-WITHFP-NEXT:    lw a1, 8(s0)
+; RV32I-WITHFP-NEXT:    lw a2, 0(s0)
+; RV32I-WITHFP-NEXT:    lw a3, 20(s0)
+; RV32I-WITHFP-NEXT:    lw a4, 16(s0)
+; RV32I-WITHFP-NEXT:    add a0, a0, a7
+; RV32I-WITHFP-NEXT:    add a1, a2, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a1
+; RV32I-WITHFP-NEXT:    add a3, a4, a3
+; RV32I-WITHFP-NEXT:    add a0, a0, a3
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index 185155eec846a..adf3630d2a0c9 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -53,12 +53,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i128 %d, i32 %e, i32 %f,
 ; RV64I-NEXT:    andi a0, a0, 255
 ; RV64I-NEXT:    slli a1, a1, 48
 ; RV64I-NEXT:    srli a1, a1, 48
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    xor a2, a4, t1
-; RV64I-NEXT:    xor a3, a3, a7
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    seqz a2, a2
-; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a0, a0, a1
+; RV64I-NEXT:    xor a1, a4, t1
+; RV64I-NEXT:    xor a2, a3, a7
+; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    add a0, a0, a5
 ; RV64I-NEXT:    add a0, a0, a6
 ; RV64I-NEXT:    add a0, a0, t0
@@ -119,8 +119,8 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    xor a2, a3, a2
-; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -174,8 +174,8 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    xor a0, a7, a0
 ; RV64I-NEXT:    xor a1, a2, a1
-; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %h, %j
@@ -328,15 +328,15 @@ define i64 @callee_aligned_stack(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i128 %f
 ; should only be 8-byte aligned
 ; RV64I-LABEL: callee_aligned_stack:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a0, 0(sp)
-; RV64I-NEXT:    ld a1, 16(sp)
-; RV64I-NEXT:    ld a2, 32(sp)
+; RV64I-NEXT:    ld a0, 32(sp)
+; RV64I-NEXT:    ld a1, 0(sp)
+; RV64I-NEXT:    ld a2, 16(sp)
 ; RV64I-NEXT:    ld a3, 40(sp)
-; RV64I-NEXT:    add a0, a7, a0
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a5, a5, a7
+; RV64I-NEXT:    add a1, a5, a1
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    add a0, a0, a3
-; RV64I-NEXT:    add a0, a5, a0
 ; RV64I-NEXT:    ret
   %f_trunc = trunc i128 %f to i64
   %1 = add i64 %f_trunc, %g

diff  --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll
index d9bb35d9ad818..2cd05c7035d56 100644
--- a/llvm/test/CodeGen/RISCV/copysign-casts.ll
+++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll
@@ -447,8 +447,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    slli a0, a0, 17
 ; RV32IF-NEXT:    srli a0, a0, 17
-; RV32IF-NEXT:    lui a2, 1048560
-; RV32IF-NEXT:    or a1, a1, a2
+; RV32IF-NEXT:    or a0, a0, a1
+; RV32IF-NEXT:    lui a1, 1048560
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    fmv.w.x fa0, a0
 ; RV32IF-NEXT:    ret
@@ -462,8 +462,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IFD-NEXT:    srli a1, a1, 16
 ; RV32IFD-NEXT:    slli a0, a0, 17
 ; RV32IFD-NEXT:    srli a0, a0, 17
-; RV32IFD-NEXT:    lui a2, 1048560
-; RV32IFD-NEXT:    or a1, a1, a2
+; RV32IFD-NEXT:    or a0, a0, a1
+; RV32IFD-NEXT:    lui a1, 1048560
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    fmv.w.x fa0, a0
 ; RV32IFD-NEXT:    ret
@@ -477,8 +477,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64IFD-NEXT:    srli a1, a1, 16
 ; RV64IFD-NEXT:    slli a0, a0, 49
 ; RV64IFD-NEXT:    srli a0, a0, 49
-; RV64IFD-NEXT:    lui a2, 1048560
-; RV64IFD-NEXT:    or a1, a1, a2
+; RV64IFD-NEXT:    or a0, a0, a1
+; RV64IFD-NEXT:    lui a1, 1048560
 ; RV64IFD-NEXT:    or a0, a0, a1
 ; RV64IFD-NEXT:    fmv.w.x fa0, a0
 ; RV64IFD-NEXT:    ret
@@ -581,8 +581,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    slli a0, a0, 17
 ; RV32IF-NEXT:    srli a0, a0, 17
-; RV32IF-NEXT:    lui a2, 1048560
-; RV32IF-NEXT:    or a1, a1, a2
+; RV32IF-NEXT:    or a0, a0, a1
+; RV32IF-NEXT:    lui a1, 1048560
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    fmv.w.x fa0, a0
 ; RV32IF-NEXT:    ret
@@ -599,7 +599,7 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFD-NEXT:    slli a1, a1, 17
 ; RV32IFD-NEXT:    srli a1, a1, 17
 ; RV32IFD-NEXT:    lui a2, 1048560
-; RV32IFD-NEXT:    or a0, a0, a2
+; RV32IFD-NEXT:    or a1, a1, a2
 ; RV32IFD-NEXT:    or a0, a1, a0
 ; RV32IFD-NEXT:    fmv.w.x fa0, a0
 ; RV32IFD-NEXT:    addi sp, sp, 16
@@ -615,7 +615,7 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64IFD-NEXT:    slli a0, a0, 63
 ; RV64IFD-NEXT:    srli a0, a0, 48
 ; RV64IFD-NEXT:    lui a2, 1048560
-; RV64IFD-NEXT:    or a0, a0, a2
+; RV64IFD-NEXT:    or a1, a1, a2
 ; RV64IFD-NEXT:    or a0, a1, a0
 ; RV64IFD-NEXT:    fmv.w.x fa0, a0
 ; RV64IFD-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 6ba8e620b2911..da3045614a245 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -80,11 +80,11 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
 ; RV32-NEXT:    addi a3, a3, -820
 ; RV32-NEXT:    mul a3, a5, a3
 ; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/div-pow2.ll b/llvm/test/CodeGen/RISCV/div-pow2.ll
index d5a8036a5532a..254e675b4ed8b 100644
--- a/llvm/test/CodeGen/RISCV/div-pow2.ll
+++ b/llvm/test/CodeGen/RISCV/div-pow2.ll
@@ -213,8 +213,8 @@ define i64 @sdiv64_pow2_negative_2(i64 %a) {
 ; RV32I-NEXT:    neg a0, a3
 ; RV32I-NEXT:    snez a2, a3
 ; RV32I-NEXT:    srai a1, a1, 1
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_2:
@@ -269,8 +269,8 @@ define i64 @sdiv64_pow2_negative_2048(i64 %a) {
 ; RV32I-NEXT:    neg a0, a3
 ; RV32I-NEXT:    snez a2, a3
 ; RV32I-NEXT:    srai a1, a1, 11
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_2048:
@@ -326,8 +326,8 @@ define i64 @sdiv64_pow2_negative_4096(i64 %a) {
 ; RV32I-NEXT:    neg a0, a3
 ; RV32I-NEXT:    snez a2, a3
 ; RV32I-NEXT:    srai a1, a1, 12
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_4096:
@@ -383,8 +383,8 @@ define i64 @sdiv64_pow2_negative_65536(i64 %a) {
 ; RV32I-NEXT:    neg a0, a3
 ; RV32I-NEXT:    snez a2, a3
 ; RV32I-NEXT:    srai a1, a1, 16
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_65536:
@@ -404,11 +404,11 @@ define i64 @sdiv64_pow2_8589934592(i64 %a) {
 ; RV32I-LABEL: sdiv64_pow2_8589934592:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    srli a2, a1, 31
-; RV32I-NEXT:    srai a3, a1, 31
-; RV32I-NEXT:    add a3, a0, a3
-; RV32I-NEXT:    sltu a0, a3, a0
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
+; RV32I-NEXT:    add a2, a1, a2
+; RV32I-NEXT:    srai a1, a1, 31
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
+; RV32I-NEXT:    add a1, a2, a0
 ; RV32I-NEXT:    srai a0, a1, 1
 ; RV32I-NEXT:    srai a1, a1, 31
 ; RV32I-NEXT:    ret
@@ -429,16 +429,16 @@ define i64 @sdiv64_pow2_negative_8589934592(i64 %a) {
 ; RV32I-LABEL: sdiv64_pow2_negative_8589934592:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    srli a2, a1, 31
-; RV32I-NEXT:    srai a3, a1, 31
-; RV32I-NEXT:    add a3, a0, a3
-; RV32I-NEXT:    sltu a0, a3, a0
+; RV32I-NEXT:    add a2, a1, a2
+; RV32I-NEXT:    srai a1, a1, 31
+; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    sltu a0, a1, a0
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    srai a0, a0, 1
 ; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index 9963d04d07b51..1466fb4df9ba3 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -195,11 +195,11 @@ define i64 @udiv64_constant(i64 %a) nounwind {
 ; RV32IM-NEXT:    addi a3, a3, -820
 ; RV32IM-NEXT:    mul a3, a5, a3
 ; RV32IM-NEXT:    mulhu a6, a5, a4
+; RV32IM-NEXT:    add a3, a6, a3
 ; RV32IM-NEXT:    sltu a0, a0, a2
 ; RV32IM-NEXT:    sub a1, a1, a0
-; RV32IM-NEXT:    mul a0, a1, a4
-; RV32IM-NEXT:    add a0, a3, a0
-; RV32IM-NEXT:    add a1, a6, a0
+; RV32IM-NEXT:    mul a1, a1, a4
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    mul a0, a5, a4
 ; RV32IM-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 6a5a456e6a439..7eb7e14353329 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3227,14 +3227,14 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    or a4, a1, a0
 ; RV32IF-NEXT:    snez a4, a4
 ; RV32IF-NEXT:    addi a4, a4, -1
+; RV32IF-NEXT:    and a3, a4, a3
 ; RV32IF-NEXT:    xori a0, a0, 1
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    seqz a0, a0
 ; RV32IF-NEXT:    addi a1, a0, -1
 ; RV32IF-NEXT:    and a0, a1, a3
-; RV32IF-NEXT:    and a0, a0, a4
+; RV32IF-NEXT:    and a2, a4, a2
 ; RV32IF-NEXT:    and a1, a1, a2
-; RV32IF-NEXT:    and a1, a1, a4
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    addi sp, sp, 32
 ; RV32IF-NEXT:    ret
@@ -3248,11 +3248,11 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV64-NEXT:    call __fixunsdfti at plt
 ; RV64-NEXT:    snez a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    seqz a1, a1
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -3272,14 +3272,14 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    or a4, a1, a0
 ; RV32IFD-NEXT:    snez a4, a4
 ; RV32IFD-NEXT:    addi a4, a4, -1
+; RV32IFD-NEXT:    and a3, a4, a3
 ; RV32IFD-NEXT:    xori a0, a0, 1
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    seqz a0, a0
 ; RV32IFD-NEXT:    addi a1, a0, -1
 ; RV32IFD-NEXT:    and a0, a1, a3
-; RV32IFD-NEXT:    and a0, a0, a4
+; RV32IFD-NEXT:    and a2, a4, a2
 ; RV32IFD-NEXT:    and a1, a1, a2
-; RV32IFD-NEXT:    and a1, a1, a4
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
@@ -3326,14 +3326,14 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    seqz a6, a1
 ; RV32IF-NEXT:  .LBB47_7: # %entry
 ; RV32IF-NEXT:    neg a6, a6
+; RV32IF-NEXT:    and a3, a6, a3
 ; RV32IF-NEXT:    xori a1, a1, 1
 ; RV32IF-NEXT:    or a1, a1, a0
 ; RV32IF-NEXT:    seqz a1, a1
 ; RV32IF-NEXT:    addi a1, a1, -1
 ; RV32IF-NEXT:    and a3, a1, a3
-; RV32IF-NEXT:    and a3, a3, a6
+; RV32IF-NEXT:    and a4, a6, a4
 ; RV32IF-NEXT:    and a1, a1, a4
-; RV32IF-NEXT:    and a1, a1, a6
 ; RV32IF-NEXT:    neg a4, a5
 ; RV32IF-NEXT:    and a4, a4, a0
 ; RV32IF-NEXT:    mv a0, a3
@@ -3376,11 +3376,11 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV64-NEXT:  .LBB47_2: # %entry
 ; RV64-NEXT:    slti a3, a1, 1
 ; RV64-NEXT:    neg a3, a3
+; RV64-NEXT:    and a0, a3, a0
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    seqz a1, a1
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    and a0, a0, a3
 ; RV64-NEXT:    beqz a2, .LBB47_4
 ; RV64-NEXT:  # %bb.3: # %entry
 ; RV64-NEXT:    sgtz a1, a2
@@ -3424,14 +3424,14 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    seqz a6, a1
 ; RV32IFD-NEXT:  .LBB47_7: # %entry
 ; RV32IFD-NEXT:    neg a6, a6
+; RV32IFD-NEXT:    and a3, a6, a3
 ; RV32IFD-NEXT:    xori a1, a1, 1
 ; RV32IFD-NEXT:    or a1, a1, a0
 ; RV32IFD-NEXT:    seqz a1, a1
 ; RV32IFD-NEXT:    addi a1, a1, -1
 ; RV32IFD-NEXT:    and a3, a1, a3
-; RV32IFD-NEXT:    and a3, a3, a6
+; RV32IFD-NEXT:    and a4, a6, a4
 ; RV32IFD-NEXT:    and a1, a1, a4
-; RV32IFD-NEXT:    and a1, a1, a6
 ; RV32IFD-NEXT:    neg a4, a5
 ; RV32IFD-NEXT:    and a4, a4, a0
 ; RV32IFD-NEXT:    mv a0, a3
@@ -3590,14 +3590,14 @@ define i64 @utest_f32i64_mm(float %x) {
 ; RV32-NEXT:    or a4, a1, a0
 ; RV32-NEXT:    snez a4, a4
 ; RV32-NEXT:    addi a4, a4, -1
+; RV32-NEXT:    and a3, a4, a3
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a1, a0, -1
 ; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a2, a4, a2
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    and a1, a1, a4
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3611,11 +3611,11 @@ define i64 @utest_f32i64_mm(float %x) {
 ; RV64-NEXT:    call __fixunssfti at plt
 ; RV64-NEXT:    snez a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    seqz a1, a1
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -3660,14 +3660,14 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    seqz a6, a1
 ; RV32-NEXT:  .LBB50_7: # %entry
 ; RV32-NEXT:    neg a6, a6
+; RV32-NEXT:    and a3, a6, a3
 ; RV32-NEXT:    xori a1, a1, 1
 ; RV32-NEXT:    or a1, a1, a0
 ; RV32-NEXT:    seqz a1, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a3, a1, a3
-; RV32-NEXT:    and a3, a3, a6
+; RV32-NEXT:    and a4, a6, a4
 ; RV32-NEXT:    and a1, a1, a4
-; RV32-NEXT:    and a1, a1, a6
 ; RV32-NEXT:    neg a4, a5
 ; RV32-NEXT:    and a4, a4, a0
 ; RV32-NEXT:    mv a0, a3
@@ -3710,11 +3710,11 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV64-NEXT:  .LBB50_2: # %entry
 ; RV64-NEXT:    slti a3, a1, 1
 ; RV64-NEXT:    neg a3, a3
+; RV64-NEXT:    and a0, a3, a0
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    seqz a1, a1
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    and a0, a0, a3
 ; RV64-NEXT:    beqz a2, .LBB50_4
 ; RV64-NEXT:  # %bb.3: # %entry
 ; RV64-NEXT:    sgtz a1, a2
@@ -3901,14 +3901,14 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; RV32-NEXT:    or a4, a1, a0
 ; RV32-NEXT:    snez a4, a4
 ; RV32-NEXT:    addi a4, a4, -1
+; RV32-NEXT:    and a3, a4, a3
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a1, a0, -1
 ; RV32-NEXT:    and a0, a1, a3
-; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    and a2, a4, a2
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    and a1, a1, a4
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 32
 ; RV32-NEXT:    ret
@@ -3924,11 +3924,11 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; RV64-NEXT:    call __fixunssfti at plt
 ; RV64-NEXT:    snez a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    seqz a1, a1
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 16
 ; RV64-NEXT:    ret
@@ -3975,14 +3975,14 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    seqz a6, a1
 ; RV32-NEXT:  .LBB53_7: # %entry
 ; RV32-NEXT:    neg a6, a6
+; RV32-NEXT:    and a3, a6, a3
 ; RV32-NEXT:    xori a1, a1, 1
 ; RV32-NEXT:    or a1, a1, a0
 ; RV32-NEXT:    seqz a1, a1
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a3, a1, a3
-; RV32-NEXT:    and a3, a3, a6
+; RV32-NEXT:    and a4, a6, a4
 ; RV32-NEXT:    and a1, a1, a4
-; RV32-NEXT:    and a1, a1, a6
 ; RV32-NEXT:    neg a4, a5
 ; RV32-NEXT:    and a4, a4, a0
 ; RV32-NEXT:    mv a0, a3
@@ -4027,11 +4027,11 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV64-NEXT:  .LBB53_2: # %entry
 ; RV64-NEXT:    slti a3, a1, 1
 ; RV64-NEXT:    neg a3, a3
+; RV64-NEXT:    and a0, a3, a0
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    seqz a1, a1
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    and a0, a0, a3
 ; RV64-NEXT:    beqz a2, .LBB53_4
 ; RV64-NEXT:  # %bb.3: # %entry
 ; RV64-NEXT:    sgtz a1, a2

diff  --git a/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll
index 04a879e024b2c..77faf67002eac 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll
@@ -5574,17 +5574,17 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    call __fixunsdfti at plt
 ; CHECK-NOV-NEXT:    snez a2, a1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a0, a2, a0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    seqz a1, a1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    and a0, a0, a2
 ; CHECK-NOV-NEXT:    snez a1, s1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    and a1, a1, s0
 ; CHECK-NOV-NEXT:    addi s1, s1, -1
 ; CHECK-NOV-NEXT:    seqz a2, s1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -5622,18 +5622,18 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:    call __fixunsdfti at plt
 ; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a2, a2, -1
+; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    addi s1, s1, -1
 ; CHECK-V-NEXT:    seqz a3, s1
 ; CHECK-V-NEXT:    addi a3, a3, -1
-; CHECK-V-NEXT:    and a3, a3, s0
 ; CHECK-V-NEXT:    and a2, a3, a2
 ; CHECK-V-NEXT:    snez a3, a1
 ; CHECK-V-NEXT:    addi a3, a3, -1
+; CHECK-V-NEXT:    and a0, a3, a0
 ; CHECK-V-NEXT:    addi a1, a1, -1
 ; CHECK-V-NEXT:    seqz a1, a1
 ; CHECK-V-NEXT:    addi a1, a1, -1
 ; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    and a0, a0, a3
 ; CHECK-V-NEXT:    sd a0, 24(sp)
 ; CHECK-V-NEXT:    sd a2, 32(sp)
 ; CHECK-V-NEXT:    addi a0, sp, 24
@@ -5695,10 +5695,10 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    slti a0, s1, 1
 ; CHECK-NOV-NEXT:    neg a0, a0
+; CHECK-NOV-NEXT:    and a0, a0, s0
 ; CHECK-NOV-NEXT:    addi s1, s1, -1
 ; CHECK-NOV-NEXT:    seqz a5, s1
 ; CHECK-NOV-NEXT:    addi a5, a5, -1
-; CHECK-NOV-NEXT:    and a5, a5, s0
 ; CHECK-NOV-NEXT:    and a0, a5, a0
 ; CHECK-NOV-NEXT:    beqz a4, .LBB47_6
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
@@ -6064,17 +6064,17 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    call __fixunssfti at plt
 ; CHECK-NOV-NEXT:    snez a2, a1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a0, a2, a0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    seqz a1, a1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    and a0, a0, a2
 ; CHECK-NOV-NEXT:    snez a1, s1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    and a1, a1, s0
 ; CHECK-NOV-NEXT:    addi s1, s1, -1
 ; CHECK-NOV-NEXT:    seqz a2, s1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -6112,18 +6112,18 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    call __fixunssfti at plt
 ; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a2, a2, -1
+; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    addi s1, s1, -1
 ; CHECK-V-NEXT:    seqz a3, s1
 ; CHECK-V-NEXT:    addi a3, a3, -1
-; CHECK-V-NEXT:    and a3, a3, s0
 ; CHECK-V-NEXT:    and a2, a3, a2
 ; CHECK-V-NEXT:    snez a3, a1
 ; CHECK-V-NEXT:    addi a3, a3, -1
+; CHECK-V-NEXT:    and a0, a3, a0
 ; CHECK-V-NEXT:    addi a1, a1, -1
 ; CHECK-V-NEXT:    seqz a1, a1
 ; CHECK-V-NEXT:    addi a1, a1, -1
 ; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    and a0, a0, a3
 ; CHECK-V-NEXT:    sd a0, 24(sp)
 ; CHECK-V-NEXT:    sd a2, 32(sp)
 ; CHECK-V-NEXT:    addi a0, sp, 24
@@ -6185,10 +6185,10 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    slti a0, s1, 1
 ; CHECK-NOV-NEXT:    neg a0, a0
+; CHECK-NOV-NEXT:    and a0, a0, s0
 ; CHECK-NOV-NEXT:    addi s1, s1, -1
 ; CHECK-NOV-NEXT:    seqz a5, s1
 ; CHECK-NOV-NEXT:    addi a5, a5, -1
-; CHECK-NOV-NEXT:    and a5, a5, s0
 ; CHECK-NOV-NEXT:    and a0, a5, a0
 ; CHECK-NOV-NEXT:    beqz a4, .LBB50_6
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
@@ -6549,17 +6549,17 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    call __fixunssfti at plt
 ; CHECK-NOV-NEXT:    snez a2, a1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a0, a2, a0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    seqz a1, a1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    and a0, a0, a2
 ; CHECK-NOV-NEXT:    snez a1, s2
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    and a1, a1, s1
 ; CHECK-NOV-NEXT:    addi s2, s2, -1
 ; CHECK-NOV-NEXT:    seqz a2, s2
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s1
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -6591,17 +6591,17 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    call __fixunssfti@plt
 ; CHECK-V-NEXT:    snez a2, a1
 ; CHECK-V-NEXT:    addi a2, a2, -1
+; CHECK-V-NEXT:    and a0, a2, a0
 ; CHECK-V-NEXT:    addi a1, a1, -1
 ; CHECK-V-NEXT:    seqz a1, a1
 ; CHECK-V-NEXT:    addi a1, a1, -1
 ; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    and a0, a0, a2
 ; CHECK-V-NEXT:    snez a1, s2
 ; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    and a1, a1, s1
 ; CHECK-V-NEXT:    addi s2, s2, -1
 ; CHECK-V-NEXT:    seqz a2, s2
 ; CHECK-V-NEXT:    addi a2, a2, -1
-; CHECK-V-NEXT:    and a2, a2, s1
 ; CHECK-V-NEXT:    and a1, a2, a1
 ; CHECK-V-NEXT:    sd a1, 8(sp)
 ; CHECK-V-NEXT:    sd a0, 0(sp)
@@ -6664,10 +6664,10 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
 ; CHECK-NOV-NEXT:    slti a0, s1, 1
 ; CHECK-NOV-NEXT:    neg a0, a0
+; CHECK-NOV-NEXT:    and a0, a0, s0
 ; CHECK-NOV-NEXT:    addi s1, s1, -1
 ; CHECK-NOV-NEXT:    seqz a5, s1
 ; CHECK-NOV-NEXT:    addi a5, a5, -1
-; CHECK-NOV-NEXT:    and a5, a5, s0
 ; CHECK-NOV-NEXT:    and a0, a5, a0
 ; CHECK-NOV-NEXT:    beqz a4, .LBB53_6
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
@@ -6727,10 +6727,10 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    addi a1, a1, -1
 ; CHECK-V-NEXT:    slti a0, s1, 1
 ; CHECK-V-NEXT:    neg a0, a0
+; CHECK-V-NEXT:    and a0, a0, s0
 ; CHECK-V-NEXT:    addi s1, s1, -1
 ; CHECK-V-NEXT:    seqz a5, s1
 ; CHECK-V-NEXT:    addi a5, a5, -1
-; CHECK-V-NEXT:    and a5, a5, s0
 ; CHECK-V-NEXT:    and a0, a5, a0
 ; CHECK-V-NEXT:    beqz a4, .LBB53_6
 ; CHECK-V-NEXT:  # %bb.5: # %entry

diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index 7a1d57ecad3d8..e1a7131560566 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -225,8 +225,8 @@ define i64 @abs64(i64 %x) {
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    snez a2, a0
 ; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:  .LBB6_2:
 ; RV32I-NEXT:    ret
 ;
@@ -236,8 +236,8 @@ define i64 @abs64(i64 %x) {
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    snez a2, a0
 ; RV32ZBB-NEXT:    neg a0, a0
-; RV32ZBB-NEXT:    neg a2, a2
-; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a1, a1, a2
 ; RV32ZBB-NEXT:  .LBB6_2:
 ; RV32ZBB-NEXT:    ret
 ;
@@ -264,8 +264,8 @@ define i64 @select_abs64(i64 %x) {
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    snez a2, a0
 ; RV32I-NEXT:    neg a0, a0
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:  .LBB7_2:
 ; RV32I-NEXT:    ret
 ;
@@ -275,8 +275,8 @@ define i64 @select_abs64(i64 %x) {
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    snez a2, a0
 ; RV32ZBB-NEXT:    neg a0, a0
-; RV32ZBB-NEXT:    neg a2, a2
-; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a1, a1, a2
 ; RV32ZBB-NEXT:  .LBB7_2:
 ; RV32ZBB-NEXT:    ret
 ;
@@ -301,64 +301,64 @@ define i64 @select_abs64(i64 %x) {
 define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    snez a5, a4
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    snez a5, a3
 ; RV32I-NEXT:    mv a6, a5
-; RV32I-NEXT:    beqz a3, .LBB8_2
+; RV32I-NEXT:    beqz a2, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    snez a6, a3
+; RV32I-NEXT:    snez a6, a2
 ; RV32I-NEXT:  .LBB8_2:
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB8_4
+; RV32I-NEXT:    bgez a4, .LBB8_4
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    neg a7, a1
 ; RV32I-NEXT:    sltu t0, a7, a6
 ; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    add a1, a4, a1
 ; RV32I-NEXT:    add a1, a1, t0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a2
+; RV32I-NEXT:    neg a4, a1
 ; RV32I-NEXT:    sub a1, a7, a6
-; RV32I-NEXT:    add a3, a3, a5
+; RV32I-NEXT:    add a2, a2, a5
+; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    neg a4, a4
 ; RV32I-NEXT:  .LBB8_4:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    snez a5, a4
+; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a2, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 12(a1)
+; RV32ZBB-NEXT:    snez a5, a3
 ; RV32ZBB-NEXT:    mv a6, a5
-; RV32ZBB-NEXT:    beqz a3, .LBB8_2
+; RV32ZBB-NEXT:    beqz a2, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    snez a6, a3
+; RV32ZBB-NEXT:    snez a6, a2
 ; RV32ZBB-NEXT:  .LBB8_2:
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB8_4
+; RV32ZBB-NEXT:    bgez a4, .LBB8_4
 ; RV32ZBB-NEXT:  # %bb.3:
 ; RV32ZBB-NEXT:    neg a7, a1
 ; RV32ZBB-NEXT:    sltu t0, a7, a6
 ; RV32ZBB-NEXT:    snez a1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
 ; RV32ZBB-NEXT:    add a1, a1, t0
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a2
+; RV32ZBB-NEXT:    neg a4, a1
 ; RV32ZBB-NEXT:    sub a1, a7, a6
-; RV32ZBB-NEXT:    add a3, a3, a5
+; RV32ZBB-NEXT:    add a2, a2, a5
+; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    neg a4, a4
 ; RV32ZBB-NEXT:  .LBB8_4:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a3, 0(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: abs128:
@@ -367,8 +367,8 @@ define i128 @abs128(i128 %x) {
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    snez a2, a0
 ; RV64I-NEXT:    neg a0, a0
-; RV64I-NEXT:    neg a2, a2
-; RV64I-NEXT:    sub a1, a2, a1
+; RV64I-NEXT:    neg a1, a1
+; RV64I-NEXT:    sub a1, a1, a2
 ; RV64I-NEXT:  .LBB8_2:
 ; RV64I-NEXT:    ret
 ;
@@ -378,8 +378,8 @@ define i128 @abs128(i128 %x) {
 ; RV64ZBB-NEXT:  # %bb.1:
 ; RV64ZBB-NEXT:    snez a2, a0
 ; RV64ZBB-NEXT:    neg a0, a0
-; RV64ZBB-NEXT:    neg a2, a2
-; RV64ZBB-NEXT:    sub a1, a2, a1
+; RV64ZBB-NEXT:    neg a1, a1
+; RV64ZBB-NEXT:    sub a1, a1, a2
 ; RV64ZBB-NEXT:  .LBB8_2:
 ; RV64ZBB-NEXT:    ret
   %abs = tail call i128 @llvm.abs.i128(i128 %x, i1 true)
@@ -389,64 +389,64 @@ define i128 @abs128(i128 %x) {
 define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    snez a5, a4
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a2, 4(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    snez a5, a3
 ; RV32I-NEXT:    mv a6, a5
-; RV32I-NEXT:    beqz a3, .LBB9_2
+; RV32I-NEXT:    beqz a2, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    snez a6, a3
+; RV32I-NEXT:    snez a6, a2
 ; RV32I-NEXT:  .LBB9_2:
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB9_4
+; RV32I-NEXT:    bgez a4, .LBB9_4
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    neg a7, a1
 ; RV32I-NEXT:    sltu t0, a7, a6
 ; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    add a1, a4, a1
 ; RV32I-NEXT:    add a1, a1, t0
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, a2
+; RV32I-NEXT:    neg a4, a1
 ; RV32I-NEXT:    sub a1, a7, a6
-; RV32I-NEXT:    add a3, a3, a5
+; RV32I-NEXT:    add a2, a2, a5
+; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    neg a4, a4
 ; RV32I-NEXT:  .LBB9_4:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a4, 0(a1)
-; RV32ZBB-NEXT:    lw a3, 4(a1)
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    snez a5, a4
+; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a2, 4(a1)
+; RV32ZBB-NEXT:    lw a4, 12(a1)
+; RV32ZBB-NEXT:    snez a5, a3
 ; RV32ZBB-NEXT:    mv a6, a5
-; RV32ZBB-NEXT:    beqz a3, .LBB9_2
+; RV32ZBB-NEXT:    beqz a2, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    snez a6, a3
+; RV32ZBB-NEXT:    snez a6, a2
 ; RV32ZBB-NEXT:  .LBB9_2:
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB9_4
+; RV32ZBB-NEXT:    bgez a4, .LBB9_4
 ; RV32ZBB-NEXT:  # %bb.3:
 ; RV32ZBB-NEXT:    neg a7, a1
 ; RV32ZBB-NEXT:    sltu t0, a7, a6
 ; RV32ZBB-NEXT:    snez a1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
 ; RV32ZBB-NEXT:    add a1, a1, t0
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, a2
+; RV32ZBB-NEXT:    neg a4, a1
 ; RV32ZBB-NEXT:    sub a1, a7, a6
-; RV32ZBB-NEXT:    add a3, a3, a5
+; RV32ZBB-NEXT:    add a2, a2, a5
+; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    neg a4, a4
 ; RV32ZBB-NEXT:  .LBB9_4:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a3, 0(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: select_abs128:
@@ -455,8 +455,8 @@ define i128 @select_abs128(i128 %x) {
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    snez a2, a0
 ; RV64I-NEXT:    neg a0, a0
-; RV64I-NEXT:    neg a2, a2
-; RV64I-NEXT:    sub a1, a2, a1
+; RV64I-NEXT:    neg a1, a1
+; RV64I-NEXT:    sub a1, a1, a2
 ; RV64I-NEXT:  .LBB9_2:
 ; RV64I-NEXT:    ret
 ;
@@ -466,8 +466,8 @@ define i128 @select_abs128(i128 %x) {
 ; RV64ZBB-NEXT:  # %bb.1:
 ; RV64ZBB-NEXT:    snez a2, a0
 ; RV64ZBB-NEXT:    neg a0, a0
-; RV64ZBB-NEXT:    neg a2, a2
-; RV64ZBB-NEXT:    sub a1, a2, a1
+; RV64ZBB-NEXT:    neg a1, a1
+; RV64ZBB-NEXT:    sub a1, a1, a2
 ; RV64ZBB-NEXT:  .LBB9_2:
 ; RV64ZBB-NEXT:    ret
   %1 = icmp slt i128 %x, 0

diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 35c8e765e5e0c..31843f5f3029f 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -142,9 +142,9 @@ define i64 @mul64(i64 %a, i64 %b) nounwind {
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    mul a3, a0, a3
 ; RV32IM-NEXT:    mulhu a4, a0, a2
+; RV32IM-NEXT:    add a3, a4, a3
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    add a1, a3, a1
-; RV32IM-NEXT:    add a1, a4, a1
 ; RV32IM-NEXT:    mul a0, a0, a2
 ; RV32IM-NEXT:    ret
 ;
@@ -169,8 +169,8 @@ define i64 @mul64_constant(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a0, a0, 30
 ; RV32I-NEXT:    slli a4, a1, 2
 ; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a1, a1, a3
-; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    add a1, a0, a3
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
@@ -179,8 +179,8 @@ define i64 @mul64_constant(i64 %a) nounwind {
 ; RV32IM-NEXT:    li a2, 5
 ; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 2
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    add a1, a1, a3
 ; RV32IM-NEXT:    slli a2, a0, 2
 ; RV32IM-NEXT:    add a0, a2, a0
 ; RV32IM-NEXT:    ret
@@ -256,8 +256,8 @@ define i32 @mulhs_positive_constant(i32 %a) nounwind {
 ; RV32I-NEXT:    srli a0, a0, 30
 ; RV32I-NEXT:    slli a3, a1, 2
 ; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    add a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: mulhs_positive_constant:
@@ -298,11 +298,11 @@ define i32 @mulhs_negative_constant(i32 %a) nounwind {
 ; RV32I-NEXT:    srli a0, a0, 30
 ; RV32I-NEXT:    slli a4, a1, 2
 ; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a0, a1, a0
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    snez a1, a3
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: mulhs_negative_constant:
@@ -530,8 +530,8 @@ define i64 @muli64_p65(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a0, a0, 26
 ; RV32I-NEXT:    slli a4, a1, 6
 ; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a1, a1, a3
-; RV32I-NEXT:    add a1, a0, a1
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    add a1, a0, a3
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
 ;
@@ -540,8 +540,8 @@ define i64 @muli64_p65(i64 %a) nounwind {
 ; RV32IM-NEXT:    li a2, 65
 ; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 6
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    add a1, a1, a3
 ; RV32IM-NEXT:    slli a2, a0, 6
 ; RV32IM-NEXT:    add a0, a2, a0
 ; RV32IM-NEXT:    ret
@@ -569,8 +569,8 @@ define i64 @muli64_p63(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a4, a0, 26
 ; RV32I-NEXT:    slli a5, a1, 6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    sub a1, a4, a1
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -579,8 +579,8 @@ define i64 @muli64_p63(i64 %a) nounwind {
 ; RV32IM-NEXT:    li a2, 63
 ; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 6
-; RV32IM-NEXT:    sub a1, a2, a1
-; RV32IM-NEXT:    add a1, a1, a3
+; RV32IM-NEXT:    sub a1, a3, a1
+; RV32IM-NEXT:    add a1, a2, a1
 ; RV32IM-NEXT:    slli a2, a0, 6
 ; RV32IM-NEXT:    sub a0, a2, a0
 ; RV32IM-NEXT:    ret
@@ -668,7 +668,7 @@ define i64 @muli64_m63(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a4, a0, 26
 ; RV32I-NEXT:    slli a5, a1, 6
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    add a3, a4, a3
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
@@ -679,8 +679,8 @@ define i64 @muli64_m63(i64 %a) nounwind {
 ; RV32IM-NEXT:    sub a1, a1, a2
 ; RV32IM-NEXT:    li a2, -63
 ; RV32IM-NEXT:    mulhu a2, a0, a2
-; RV32IM-NEXT:    sub a1, a0, a1
-; RV32IM-NEXT:    sub a1, a2, a1
+; RV32IM-NEXT:    sub a2, a2, a0
+; RV32IM-NEXT:    add a1, a2, a1
 ; RV32IM-NEXT:    slli a2, a0, 6
 ; RV32IM-NEXT:    sub a0, a0, a2
 ; RV32IM-NEXT:    ret
@@ -709,9 +709,9 @@ define i64 @muli64_m65(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a0, a0, 26
 ; RV32I-NEXT:    slli a4, a1, 6
 ; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    snez a1, a3
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    sub a1, a1, a0
 ; RV32I-NEXT:    neg a0, a3
@@ -723,7 +723,7 @@ define i64 @muli64_m65(i64 %a) nounwind {
 ; RV32IM-NEXT:    add a1, a2, a1
 ; RV32IM-NEXT:    li a2, -65
 ; RV32IM-NEXT:    mulhu a2, a0, a2
-; RV32IM-NEXT:    add a1, a0, a1
+; RV32IM-NEXT:    sub a2, a2, a0
 ; RV32IM-NEXT:    sub a1, a2, a1
 ; RV32IM-NEXT:    slli a2, a0, 6
 ; RV32IM-NEXT:    neg a0, a0
@@ -949,11 +949,11 @@ define i64 @muli64_p4352(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 20
 ; RV32I-NEXT:    slli a1, a1, 12
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a3, a0, 8
-; RV32I-NEXT:    slli a4, a0, 12
-; RV32I-NEXT:    add a0, a4, a3
-; RV32I-NEXT:    sltu a3, a0, a4
-; RV32I-NEXT:    add a2, a2, a3
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a0, 12
+; RV32I-NEXT:    add a0, a3, a2
+; RV32I-NEXT:    sltu a2, a0, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -993,12 +993,12 @@ define i64 @muli64_p3840(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 20
 ; RV32I-NEXT:    slli a1, a1, 12
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a3, a0, 8
-; RV32I-NEXT:    slli a0, a0, 12
-; RV32I-NEXT:    sltu a4, a0, a3
-; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a3
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a0, a0, 12
+; RV32I-NEXT:    sltu a3, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_p3840:
@@ -1047,8 +1047,8 @@ define i64 @muli64_m4352(i64 %a) nounwind {
 ; RV32IM-NEXT:    slli a2, a2, 8
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    mulhu a3, a0, a2
-; RV32IM-NEXT:    sub a1, a0, a1
-; RV32IM-NEXT:    sub a1, a3, a1
+; RV32IM-NEXT:    sub a3, a3, a0
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    mul a0, a0, a2
 ; RV32IM-NEXT:    ret
 ;
@@ -1077,12 +1077,12 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    slli a3, a0, 12
-; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    sltu a4, a0, a3
-; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a3
+; RV32I-NEXT:    slli a2, a0, 12
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    sltu a3, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_m3840:
@@ -1091,8 +1091,8 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 ; RV32IM-NEXT:    slli a2, a2, 8
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    mulhu a3, a0, a2
-; RV32IM-NEXT:    sub a1, a0, a1
-; RV32IM-NEXT:    sub a1, a3, a1
+; RV32IM-NEXT:    sub a3, a3, a0
+; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    mul a0, a0, a2
 ; RV32IM-NEXT:    ret
 ;
@@ -1126,14 +1126,14 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32I-NEXT:    srli a2, a4, 24
 ; RV32I-NEXT:    slli a7, a3, 8
 ; RV32I-NEXT:    or a2, a7, a2
-; RV32I-NEXT:    sltu a7, a2, a1
-; RV32I-NEXT:    srli t0, a3, 20
+; RV32I-NEXT:    sltu t0, a2, a1
+; RV32I-NEXT:    srli a7, a3, 20
 ; RV32I-NEXT:    slli t1, a5, 12
-; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or a7, t1, a7
 ; RV32I-NEXT:    srli a3, a3, 24
 ; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or t1, a5, a3
-; RV32I-NEXT:    add t0, t0, a7
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    sub t1, a3, a7
 ; RV32I-NEXT:    srli a3, a6, 20
 ; RV32I-NEXT:    slli a5, a4, 12
 ; RV32I-NEXT:    or a3, a5, a3
@@ -1195,19 +1195,19 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
 ; RV32IM-NEXT:    mulhu t1, a1, t2
+; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
 ; RV32IM-NEXT:    sltu t0, t5, t0
 ; RV32IM-NEXT:    mul a2, a2, a5
-; RV32IM-NEXT:    mulhu t2, a3, a5
-; RV32IM-NEXT:    sub a3, a3, a2
-; RV32IM-NEXT:    sub a2, t2, a3
+; RV32IM-NEXT:    mulhu t1, a3, a5
+; RV32IM-NEXT:    sub a3, t1, a3
+; RV32IM-NEXT:    add a2, a3, a2
 ; RV32IM-NEXT:    add a1, a4, a1
-; RV32IM-NEXT:    sub a1, a1, a2
-; RV32IM-NEXT:    sub a1, a1, t0
 ; RV32IM-NEXT:    sub a1, t3, a1
+; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    add a1, a1, t0
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    add a1, t1, a1
 ; RV32IM-NEXT:    mul a2, a4, a5
 ; RV32IM-NEXT:    sw a2, 0(a0)
 ; RV32IM-NEXT:    sw a6, 4(a0)
@@ -1226,12 +1226,12 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    slli a3, a0, 12
-; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    sltu a4, a0, a3
-; RV64I-NEXT:    add a2, a2, a4
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    slli a2, a0, 12
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    sltu a3, a0, a2
+; RV64I-NEXT:    sub a1, a1, a3
+; RV64I-NEXT:    sub a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
 ; RV64IM-LABEL: muli128_m3840:
@@ -1240,8 +1240,8 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV64IM-NEXT:    slli a2, a2, 8
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    mulhu a3, a0, a2
-; RV64IM-NEXT:    sub a1, a0, a1
-; RV64IM-NEXT:    sub a1, a3, a1
+; RV64IM-NEXT:    sub a3, a3, a0
+; RV64IM-NEXT:    add a1, a3, a1
 ; RV64IM-NEXT:    mul a0, a0, a2
 ; RV64IM-NEXT:    ret
   %1 = mul i128 %a, -3840
@@ -1274,9 +1274,9 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32I-NEXT:    srli a7, a7, 26
 ; RV32I-NEXT:    slli t4, a5, 6
 ; RV32I-NEXT:    or a7, t4, a7
-; RV32I-NEXT:    add a7, a7, t1
-; RV32I-NEXT:    add a7, a7, t3
 ; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    sub a5, a5, t1
+; RV32I-NEXT:    sub a5, a5, t3
 ; RV32I-NEXT:    sub a7, t2, t0
 ; RV32I-NEXT:    sub a3, a3, a6
 ; RV32I-NEXT:    sub a3, a3, a4
@@ -1322,21 +1322,21 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    sltu t4, t4, s1
 ; RV32IM-NEXT:    sltu a7, t1, a7
 ; RV32IM-NEXT:    mulhu t1, a4, t2
+; RV32IM-NEXT:    add a7, t1, a7
 ; RV32IM-NEXT:    add a7, a7, t4
-; RV32IM-NEXT:    slli t2, a2, 6
-; RV32IM-NEXT:    sub a2, a2, t2
+; RV32IM-NEXT:    slli t1, a2, 6
+; RV32IM-NEXT:    sub a2, a2, t1
 ; RV32IM-NEXT:    mulhu a5, a1, a5
-; RV32IM-NEXT:    sub a1, a1, a2
 ; RV32IM-NEXT:    sub a5, a5, a1
+; RV32IM-NEXT:    add a2, a5, a2
 ; RV32IM-NEXT:    add a4, a3, a4
-; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    neg a1, t5
-; RV32IM-NEXT:    sltu a1, a1, t0
-; RV32IM-NEXT:    sub a4, a4, a1
 ; RV32IM-NEXT:    sub a1, t3, a4
+; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    neg a2, t5
+; RV32IM-NEXT:    sltu a2, a2, t0
+; RV32IM-NEXT:    add a1, a1, a2
 ; RV32IM-NEXT:    add a1, a7, a1
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    add a1, t1, a1
 ; RV32IM-NEXT:    slli a2, a3, 6
 ; RV32IM-NEXT:    sub a3, a3, a2
 ; RV32IM-NEXT:    sw a3, 0(a0)
@@ -1355,7 +1355,7 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV64I-NEXT:    srli a4, a0, 58
 ; RV64I-NEXT:    slli a5, a1, 6
 ; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    add a3, a4, a3
+; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    sub a1, a1, a3
 ; RV64I-NEXT:    sub a0, a0, a2
 ; RV64I-NEXT:    ret
@@ -1366,8 +1366,8 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV64IM-NEXT:    sub a1, a1, a2
 ; RV64IM-NEXT:    li a2, -63
 ; RV64IM-NEXT:    mulhu a2, a0, a2
-; RV64IM-NEXT:    sub a1, a0, a1
-; RV64IM-NEXT:    sub a1, a2, a1
+; RV64IM-NEXT:    sub a2, a2, a0
+; RV64IM-NEXT:    add a1, a2, a1
 ; RV64IM-NEXT:    slli a2, a0, 6
 ; RV64IM-NEXT:    sub a0, a0, a2
 ; RV64IM-NEXT:    ret
@@ -1441,13 +1441,13 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sltu a3, a2, s9
 ; RV32I-NEXT:    sltu a4, s9, s5
 ; RV32I-NEXT:    sltu a5, s8, s7
+; RV32I-NEXT:    add a5, s6, a5
 ; RV32I-NEXT:    add a4, a5, a4
+; RV32I-NEXT:    add a1, a1, s3
 ; RV32I-NEXT:    sltu a0, s2, a0
-; RV32I-NEXT:    add a0, s3, a0
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a3
-; RV32I-NEXT:    add a1, s6, a0
+; RV32I-NEXT:    add a1, a0, a3
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
@@ -1486,14 +1486,14 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32IM-NEXT:    sltu a7, t0, a7
 ; RV32IM-NEXT:    sltu a5, a5, a6
 ; RV32IM-NEXT:    mulhu a3, a1, a3
-; RV32IM-NEXT:    add a5, a5, a7
+; RV32IM-NEXT:    add a3, a3, a5
+; RV32IM-NEXT:    add a3, a3, a7
 ; RV32IM-NEXT:    mul a1, a4, a1
 ; RV32IM-NEXT:    mulhu a0, a4, a0
-; RV32IM-NEXT:    add a1, a1, t1
 ; RV32IM-NEXT:    add a0, a0, a1
-; RV32IM-NEXT:    add a0, a5, a0
-; RV32IM-NEXT:    add a0, a0, t2
-; RV32IM-NEXT:    add a1, a3, a0
+; RV32IM-NEXT:    add a0, a0, t1
+; RV32IM-NEXT:    add a0, a3, a0
+; RV32IM-NEXT:    add a1, a0, t2
 ; RV32IM-NEXT:    mv a0, a2
 ; RV32IM-NEXT:    ret
 ;

diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index ebf4dfd390349..06be6cbd96410 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -83,8 +83,8 @@ define i64 @neg_abs64(i64 %x) {
 ; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
-; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -94,8 +94,8 @@ define i64 @neg_abs64(i64 %x) {
 ; RV32ZBB-NEXT:    xor a0, a0, a2
 ; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
-; RV32ZBB-NEXT:    add a1, a1, a3
 ; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -123,8 +123,8 @@ define i64 @select_neg_abs64(i64 %x) {
 ; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
-; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -134,8 +134,8 @@ define i64 @select_neg_abs64(i64 %x) {
 ; RV32ZBB-NEXT:    xor a0, a0, a2
 ; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
-; RV32ZBB-NEXT:    add a1, a1, a3
 ; RV32ZBB-NEXT:    sub a1, a2, a1
+; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -204,14 +204,14 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32I-NEXT:    bgez a1, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    snez a3, a0
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    sub a1, a3, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:  .LBB5_2:
 ; RV32I-NEXT:    sw a0, 0(a2)
 ; RV32I-NEXT:    snez a3, a0
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    neg a4, a1
+; RV32I-NEXT:    sub a3, a4, a3
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    sw a1, 4(a2)
 ; RV32I-NEXT:    mv a1, a3
@@ -222,14 +222,14 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32ZBB-NEXT:    bgez a1, .LBB5_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    snez a3, a0
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    sub a1, a3, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:  .LBB5_2:
 ; RV32ZBB-NEXT:    sw a0, 0(a2)
 ; RV32ZBB-NEXT:    snez a3, a0
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    sub a3, a3, a1
+; RV32ZBB-NEXT:    neg a4, a1
+; RV32ZBB-NEXT:    sub a3, a4, a3
 ; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    sw a1, 4(a2)
 ; RV32ZBB-NEXT:    mv a1, a3

diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
index 0e6c30c42567d..f735d21775114 100644
--- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
@@ -1076,8 +1076,8 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltu a1, a0, a1
+; RV32I-NEXT:    add a3, a5, a3
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    add a1, a5, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: rotl_64_mask_shared:
@@ -1131,8 +1131,8 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32ZBB-NEXT:    and a0, a0, a2
 ; RV32ZBB-NEXT:    add a0, a1, a0
 ; RV32ZBB-NEXT:    sltu a1, a0, a1
+; RV32ZBB-NEXT:    add a3, a5, a3
 ; RV32ZBB-NEXT:    add a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a5, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: rotl_64_mask_shared:
@@ -1232,7 +1232,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    add a0, a6, a0
 ; RV32I-NEXT:    sltu a2, a0, a6
-; RV32I-NEXT:    add a2, a3, a2
+; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -1286,7 +1286,7 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32ZBB-NEXT:    and a0, a0, a2
 ; RV32ZBB-NEXT:    add a0, a6, a0
 ; RV32ZBB-NEXT:    sltu a2, a0, a6
-; RV32ZBB-NEXT:    add a2, a3, a2
+; RV32ZBB-NEXT:    add a1, a1, a3
 ; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1369,14 +1369,14 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    srl t0, t0, a1
 ; RV32I-NEXT:    sll t1, a0, a4
 ; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    srl a6, a0, a1
+; RV32I-NEXT:    srl t2, a0, a1
 ; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    bnez a5, .LBB21_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB21_6:
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a6, a7, t0
+; RV32I-NEXT:    or a7, t1, t2
 ; RV32I-NEXT:    sll t0, a0, a4
 ; RV32I-NEXT:    bnez a5, .LBB21_8
 ; RV32I-NEXT:  # %bb.7:
@@ -1388,11 +1388,11 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    sll a2, a2, a4
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    srl a0, a0, a1
-; RV32I-NEXT:    or a2, a2, a0
-; RV32I-NEXT:    add a0, a7, a3
-; RV32I-NEXT:    sltu a1, a0, a7
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    add a1, a6, a1
+; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    add a1, a7, a0
+; RV32I-NEXT:    add a0, a6, a3
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: rotl_64_mask_multiple:
@@ -1426,14 +1426,14 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    srl t0, t0, a1
 ; RV32ZBB-NEXT:    sll t1, a0, a4
 ; RV32ZBB-NEXT:    srli a0, a6, 1
-; RV32ZBB-NEXT:    srl a6, a0, a1
+; RV32ZBB-NEXT:    srl t2, a0, a1
 ; RV32ZBB-NEXT:    mv a0, a3
 ; RV32ZBB-NEXT:    bnez a5, .LBB21_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    mv a0, a2
 ; RV32ZBB-NEXT:  .LBB21_6:
-; RV32ZBB-NEXT:    or a7, a7, t0
-; RV32ZBB-NEXT:    or a6, t1, a6
+; RV32ZBB-NEXT:    or a6, a7, t0
+; RV32ZBB-NEXT:    or a7, t1, t2
 ; RV32ZBB-NEXT:    sll t0, a0, a4
 ; RV32ZBB-NEXT:    bnez a5, .LBB21_8
 ; RV32ZBB-NEXT:  # %bb.7:
@@ -1445,11 +1445,11 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    sll a2, a2, a4
 ; RV32ZBB-NEXT:    srli a0, a0, 1
 ; RV32ZBB-NEXT:    srl a0, a0, a1
-; RV32ZBB-NEXT:    or a2, a2, a0
-; RV32ZBB-NEXT:    add a0, a7, a3
-; RV32ZBB-NEXT:    sltu a1, a0, a7
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    add a1, a6, a1
+; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a7, a0
+; RV32ZBB-NEXT:    add a0, a6, a3
+; RV32ZBB-NEXT:    sltu a2, a0, a6
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: rotl_64_mask_multiple:
@@ -1527,16 +1527,16 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    slli t0, a1, 1
 ; RV32I-NEXT:    not a0, a4
 ; RV32I-NEXT:    sll t0, t0, a0
-; RV32I-NEXT:    srl a1, a1, a4
+; RV32I-NEXT:    srl t1, a1, a4
 ; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    sll t1, a6, a0
+; RV32I-NEXT:    sll t2, a6, a0
 ; RV32I-NEXT:    mv a6, a2
 ; RV32I-NEXT:    beqz a5, .LBB23_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    mv a6, a3
 ; RV32I-NEXT:  .LBB23_6:
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    or a1, t1, a1
+; RV32I-NEXT:    or a1, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
 ; RV32I-NEXT:    srl t0, a6, a4
 ; RV32I-NEXT:    beqz a5, .LBB23_8
 ; RV32I-NEXT:  # %bb.7:
@@ -1548,11 +1548,11 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    srl a3, a3, a4
 ; RV32I-NEXT:    slli a6, a6, 1
 ; RV32I-NEXT:    sll a0, a6, a0
-; RV32I-NEXT:    or a3, a0, a3
-; RV32I-NEXT:    add a0, a7, a2
-; RV32I-NEXT:    sltu a2, a0, a7
-; RV32I-NEXT:    add a2, a3, a2
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    add a7, a7, a0
+; RV32I-NEXT:    add a0, a1, a2
+; RV32I-NEXT:    sltu a1, a0, a1
+; RV32I-NEXT:    add a1, a7, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: rotr_64_mask_multiple:
@@ -1583,16 +1583,16 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    slli t0, a1, 1
 ; RV32ZBB-NEXT:    not a0, a4
 ; RV32ZBB-NEXT:    sll t0, t0, a0
-; RV32ZBB-NEXT:    srl a1, a1, a4
+; RV32ZBB-NEXT:    srl t1, a1, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
-; RV32ZBB-NEXT:    sll t1, a6, a0
+; RV32ZBB-NEXT:    sll t2, a6, a0
 ; RV32ZBB-NEXT:    mv a6, a2
 ; RV32ZBB-NEXT:    beqz a5, .LBB23_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    mv a6, a3
 ; RV32ZBB-NEXT:  .LBB23_6:
-; RV32ZBB-NEXT:    or a7, t0, a7
-; RV32ZBB-NEXT:    or a1, t1, a1
+; RV32ZBB-NEXT:    or a1, t0, a7
+; RV32ZBB-NEXT:    or a7, t2, t1
 ; RV32ZBB-NEXT:    srl t0, a6, a4
 ; RV32ZBB-NEXT:    beqz a5, .LBB23_8
 ; RV32ZBB-NEXT:  # %bb.7:
@@ -1604,11 +1604,11 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    srl a3, a3, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
 ; RV32ZBB-NEXT:    sll a0, a6, a0
-; RV32ZBB-NEXT:    or a3, a0, a3
-; RV32ZBB-NEXT:    add a0, a7, a2
-; RV32ZBB-NEXT:    sltu a2, a0, a7
-; RV32ZBB-NEXT:    add a2, a3, a2
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    or a0, a0, a3
+; RV32ZBB-NEXT:    add a7, a7, a0
+; RV32ZBB-NEXT:    add a0, a1, a2
+; RV32ZBB-NEXT:    sltu a1, a0, a1
+; RV32ZBB-NEXT:    add a1, a7, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: rotr_64_mask_multiple:

diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index fb0de388a7554..0318cbaa15e47 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -720,8 +720,8 @@ define i64 @abs_i64(i64 %x) {
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    snez a2, a0
 ; CHECK-NEXT:    neg a0, a0
-; CHECK-NEXT:    neg a2, a2
-; CHECK-NEXT:    sub a1, a2, a1
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    sub a1, a1, a2
 ; CHECK-NEXT:  .LBB19_2:
 ; CHECK-NEXT:    ret
   %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
@@ -774,7 +774,7 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    and a2, a0, a2
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
 ;
@@ -800,7 +800,7 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I-NEXT:    and a4, a1, a3
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a2, a4, a2
+; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a2, a1, a2
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    and a1, a1, a3
@@ -809,7 +809,7 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I-NEXT:    and a3, a0, a3
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
index b9c1c7e91a6e9..c616fbcd37425 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
@@ -16,8 +16,8 @@ define signext i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT:    slli a2, a2, 32
 ; CHECK-NEXT:    mulhu a1, a2, a1
 ; CHECK-NEXT:    srli a1, a1, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    addw a0, a3, a0
+; CHECK-NEXT:    add a0, a3, a0
+; CHECK-NEXT:    addw a0, a0, a1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    li a0, 0
@@ -61,8 +61,8 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT:    slli a3, a3, 32
 ; CHECK-NEXT:    mulhu a1, a3, a1
 ; CHECK-NEXT:    srli a1, a1, 1
-; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    subw a0, a2, a0
+; CHECK-NEXT:    subw a0, a0, a1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    li a0, 0

diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 51417f84f7f36..22ea150fcde0e 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -957,7 +957,7 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
-; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -983,7 +983,7 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
 ; RV64I-NEXT:    and a3, a0, a3
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a2, a3, a2
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    sw a0, 0(a1)
 ; RV64I-NEXT:    ret
@@ -1016,8 +1016,8 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I-NEXT:    srli a5, a0, 8
 ; RV64I-NEXT:    srliw a5, a5, 24
 ; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    or a1, a5, a1
 ; RV64I-NEXT:    and a4, a0, a4
 ; RV64I-NEXT:    slli a4, a4, 24
 ; RV64I-NEXT:    srliw a3, a0, 24
@@ -1026,8 +1026,8 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I-NEXT:    and a2, a0, a2
 ; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;

diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index b1d52a6154f99..d34c10798f482 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -36,8 +36,8 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-NEXT:    neg a2, a2
 ; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a6, a2
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    add a1, a1, a6
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    ret
 ;
@@ -72,8 +72,8 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT:    negw a2, a2
 ; RV64-NEXT:    and a2, a2, a4
 ; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a6, a2
-; RV64-NEXT:    add a0, a2, a0
+; RV64-NEXT:    add a1, a1, a6
+; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    addw a0, a1, a0
 ; RV64-NEXT:    ret
   %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index d3c3765bc669d..8cad9c26c488b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -30,20 +30,20 @@ define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    lw a3, 12(a0)
 ; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    lw a6, 0(a1)
-; RV32-NEXT:    lw a7, 8(a1)
-; RV32-NEXT:    lw t0, 4(a1)
+; RV32-NEXT:    lw a6, 4(a1)
+; RV32-NEXT:    lw a7, 0(a1)
+; RV32-NEXT:    lw t0, 8(a1)
 ; RV32-NEXT:    lw a1, 12(a1)
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a4, a6, a4
-; RV32-NEXT:    add a4, t0, a4
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a7, a4, a7
+; RV32-NEXT:    sltu a4, a7, a4
 ; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    add a7, a2, a7
-; RV32-NEXT:    sltu a2, a7, a2
-; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    sw a7, 8(a0)
-; RV32-NEXT:    sw a6, 0(a0)
+; RV32-NEXT:    add t0, a2, t0
+; RV32-NEXT:    sltu a2, t0, a2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    sw t0, 8(a0)
+; RV32-NEXT:    sw a7, 0(a0)
 ; RV32-NEXT:    sw a1, 12(a0)
 ; RV32-NEXT:    sw a4, 4(a0)
 ; RV32-NEXT:    ret
@@ -88,15 +88,15 @@ define void @add_v1i64(ptr %x, ptr %y) {
 ; RV32-LABEL: add_v1i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lw a2, 0(a0)
-; RV32-NEXT:    lw a3, 0(a1)
-; RV32-NEXT:    lw a1, 4(a1)
-; RV32-NEXT:    lw a4, 4(a0)
-; RV32-NEXT:    add a3, a2, a3
-; RV32-NEXT:    sltu a2, a3, a2
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    add a1, a4, a1
-; RV32-NEXT:    sw a3, 0(a0)
-; RV32-NEXT:    sw a1, 4(a0)
+; RV32-NEXT:    lw a3, 4(a0)
+; RV32-NEXT:    lw a4, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sltu a2, a1, a2
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    sw a1, 0(a0)
+; RV32-NEXT:    sw a2, 4(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v1i64:

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
index 6094574b1f031..5b5f27cb15844 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
@@ -244,17 +244,17 @@ define zeroext i1 @vpreduce_and_v256i1(i1 zeroext %s, <256 x i1> %v, <256 x i1>
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vcpop.m a2, v11, v0.t
 ; CHECK-NEXT:    seqz a2, a2
-; CHECK-NEXT:    addi a3, a1, -128
-; CHECK-NEXT:    sltu a1, a1, a3
+; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    addi a2, a1, -128
+; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a1, a1, a3
+; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vcpop.m a1, v8, v0.t
 ; CHECK-NEXT:    seqz a1, a1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    ret
   %r = call i1 @llvm.vp.reduce.and.v256i1(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %evl)
   ret i1 %r

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index 82c40154a0a74..90b2dd9f03830 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -425,8 +425,8 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV32-NEXT:    or a3, a3, a4
 ; RV32-NEXT:    slli a5, a5, 16
 ; RV32-NEXT:    slli a6, a6, 24
-; RV32-NEXT:    or a3, a5, a3
-; RV32-NEXT:    or a3, a6, a3
+; RV32-NEXT:    or a4, a6, a5
+; RV32-NEXT:    or a3, a4, a3
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    andi a2, a2, 2
@@ -446,7 +446,7 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV32-NEXT:    or a2, a2, a3
 ; RV32-NEXT:    slli a4, a4, 16
 ; RV32-NEXT:    slli a0, a0, 24
-; RV32-NEXT:    or a2, a4, a2
+; RV32-NEXT:    or a0, a0, a4
 ; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    vmv.s.x v9, a0
 ; RV32-NEXT:    vslideup.vi v8, v9, 1
@@ -471,8 +471,8 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV64-NEXT:    or a3, a3, a4
 ; RV64-NEXT:    slli a5, a5, 16
 ; RV64-NEXT:    slli a6, a6, 24
-; RV64-NEXT:    or a3, a5, a3
-; RV64-NEXT:    or a3, a6, a3
+; RV64-NEXT:    or a4, a6, a5
+; RV64-NEXT:    or a3, a4, a3
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a3
 ; RV64-NEXT:    andi a2, a2, 2
@@ -492,7 +492,7 @@ define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwi
 ; RV64-NEXT:    or a2, a2, a3
 ; RV64-NEXT:    slli a4, a4, 16
 ; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    or a2, a4, a2
+; RV64-NEXT:    or a0, a0, a4
 ; RV64-NEXT:    or a0, a0, a2
 ; RV64-NEXT:    vmv.s.x v9, a0
 ; RV64-NEXT:    vslideup.vi v8, v9, 1

diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
index b86c582ecb496..e40a42b621406 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll
@@ -378,8 +378,8 @@ define zeroext i1 @vpreduce_or_nxv128i1(i1 zeroext %s, <vscale x 128 x i1> %v, <
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vcpop.m a1, v11, v0.t
 ; CHECK-NEXT:    snez a1, a1
+; CHECK-NEXT:    or a0, a1, a0
 ; CHECK-NEXT:    or a0, a3, a0
-; CHECK-NEXT:    or a0, a0, a1
 ; CHECK-NEXT:    ret
   %r = call i1 @llvm.vp.reduce.or.nxv128i1(i1 %s, <vscale x 128 x i1> %v, <vscale x 128 x i1> %m, i32 %evl)
   ret i1 %r

diff --git a/llvm/test/CodeGen/RISCV/sadd_sat.ll b/llvm/test/CodeGen/RISCV/sadd_sat.ll
index 042084df1d816..3d7668b860ddf 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat.ll
@@ -59,10 +59,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a4, a1
 ; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    add a5, a4, a3
 ; RV32I-NEXT:    add a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a1
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    add a1, a5, a1
 ; RV32I-NEXT:    xor a2, a4, a1
 ; RV32I-NEXT:    xor a3, a4, a3
 ; RV32I-NEXT:    not a3, a3
@@ -94,10 +94,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    mv a4, a1
 ; RV32IZbb-NEXT:    mv a1, a0
+; RV32IZbb-NEXT:    add a5, a4, a3
 ; RV32IZbb-NEXT:    add a0, a0, a2
 ; RV32IZbb-NEXT:    sltu a1, a0, a1
-; RV32IZbb-NEXT:    add a1, a3, a1
-; RV32IZbb-NEXT:    add a1, a4, a1
+; RV32IZbb-NEXT:    add a1, a5, a1
 ; RV32IZbb-NEXT:    xor a2, a4, a1
 ; RV32IZbb-NEXT:    xor a3, a4, a3
 ; RV32IZbb-NEXT:    andn a2, a2, a3

diff --git a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
index 3e2ccba35f27e..aa9496007adcb 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
@@ -65,10 +65,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    mv a2, a1
 ; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    add a3, a2, a5
 ; RV32I-NEXT:    add a0, a0, a4
 ; RV32I-NEXT:    sltu a1, a0, a1
-; RV32I-NEXT:    add a1, a5, a1
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    xor a3, a2, a1
 ; RV32I-NEXT:    xor a2, a2, a5
 ; RV32I-NEXT:    not a2, a2
@@ -100,10 +100,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    mv a2, a1
 ; RV32IZbb-NEXT:    mv a1, a0
+; RV32IZbb-NEXT:    add a3, a2, a5
 ; RV32IZbb-NEXT:    add a0, a0, a4
 ; RV32IZbb-NEXT:    sltu a1, a0, a1
-; RV32IZbb-NEXT:    add a1, a5, a1
-; RV32IZbb-NEXT:    add a1, a2, a1
+; RV32IZbb-NEXT:    add a1, a3, a1
 ; RV32IZbb-NEXT:    xor a3, a2, a1
 ; RV32IZbb-NEXT:    xor a2, a2, a5
 ; RV32IZbb-NEXT:    andn a2, a3, a2

diff --git a/llvm/test/CodeGen/RISCV/select-binop-identity.ll b/llvm/test/CodeGen/RISCV/select-binop-identity.ll
index 00ead4d7d7c9f..3d3cdd78cbf39 100644
--- a/llvm/test/CodeGen/RISCV/select-binop-identity.ll
+++ b/llvm/test/CodeGen/RISCV/select-binop-identity.ll
@@ -272,7 +272,7 @@ define i64 @add_select_all_zeros_i64(i1 zeroext %c, i64 %x, i64 %y) {
 ; RV32I-NEXT:    and a1, a0, a1
 ; RV32I-NEXT:    add a0, a1, a3
 ; RV32I-NEXT:    sltu a1, a0, a1
-; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    add a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
@@ -343,7 +343,7 @@ define i64 @sub_select_all_zeros_i64(i1 zeroext %c, i64 %x, i64 %y) {
 ; RV32I-NEXT:    and a2, a0, a2
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    sltu a1, a3, a0
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a3, a0
 ; RV32I-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/shadowcallstack.ll b/llvm/test/CodeGen/RISCV/shadowcallstack.ll
index 9a4766dafd362..51df390df802d 100644
--- a/llvm/test/CodeGen/RISCV/shadowcallstack.ll
+++ b/llvm/test/CodeGen/RISCV/shadowcallstack.ll
@@ -88,8 +88,8 @@ define i32 @f4() shadowcallstack {
 ; RV32-NEXT:    call bar@plt
 ; RV32-NEXT:    mv s3, a0
 ; RV32-NEXT:    call bar@plt
+; RV32-NEXT:    add s0, s0, s1
 ; RV32-NEXT:    add a0, s3, a0
-; RV32-NEXT:    add a0, s1, a0
 ; RV32-NEXT:    add a0, s0, a0
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -121,8 +121,8 @@ define i32 @f4() shadowcallstack {
 ; RV64-NEXT:    call bar@plt
 ; RV64-NEXT:    mv s3, a0
 ; RV64-NEXT:    call bar@plt
+; RV64-NEXT:    add s0, s0, s1
 ; RV64-NEXT:    add a0, s3, a0
-; RV64-NEXT:    add a0, s1, a0
 ; RV64-NEXT:    addw a0, s0, a0
 ; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 2884531585700..5746fcf788cda 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -215,8 +215,8 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    andi a2, a2, 7
 ; RV32I-NEXT:    srl a3, a3, a2
 ; RV32I-NEXT:    lbu a4, 5(a1)
@@ -227,8 +227,8 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
 ; RV32I-NEXT:    xori a6, a2, 31
 ; RV32I-NEXT:    sll a5, a5, a6
@@ -242,8 +242,8 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a5, a5, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a5, t1, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    not t0, a2
 ; RV32I-NEXT:    lbu t1, 13(a1)
@@ -257,7 +257,7 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    srl a5, a5, a2
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
@@ -362,8 +362,8 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    andi a2, a2, 7
 ; RV32I-NEXT:    srl a3, a3, a2
 ; RV32I-NEXT:    lbu a4, 5(a1)
@@ -374,8 +374,8 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, a5, a4
 ; RV32I-NEXT:    slli a5, a4, 1
 ; RV32I-NEXT:    xori a6, a2, 31
 ; RV32I-NEXT:    sll a5, a5, a6
@@ -389,8 +389,8 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a5, a5, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a5, t1, a5
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    not t0, a2
 ; RV32I-NEXT:    lbu t1, 13(a1)
@@ -404,7 +404,7 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    srl a5, a5, a2
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or a1, a1, t0
 ; RV32I-NEXT:    or a1, a1, a7
 ; RV32I-NEXT:    slli a7, a1, 1
 ; RV32I-NEXT:    sll a6, a7, a6
@@ -504,8 +504,8 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    andi a2, a2, 7
 ; RV32I-NEXT:    sll a4, a1, a2
 ; RV32I-NEXT:    lbu a5, 1(a3)
@@ -516,8 +516,8 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    srli a6, a5, 1
 ; RV32I-NEXT:    xori a7, a2, 31
 ; RV32I-NEXT:    srl a6, a6, a7
@@ -530,8 +530,8 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or a6, a6, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a6, t0, a6
 ; RV32I-NEXT:    sll t0, a6, a2
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    not t1, a2
@@ -545,7 +545,7 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    or t0, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or t0, t2, t0
+; RV32I-NEXT:    or a3, a3, t2
 ; RV32I-NEXT:    or a3, a3, t0
 ; RV32I-NEXT:    sll a3, a3, a2
 ; RV32I-NEXT:    srli a6, a6, 1

diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index c96c57d2d5e8e..f1172a7314682 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -21,11 +21,11 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
 ; RV32-NEXT:    addi a3, a3, -1366
 ; RV32-NEXT:    mul a3, a5, a3
 ; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
@@ -46,11 +46,11 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a4, a0, a3
 ; RV64-NEXT:    mul a5, a4, a6
 ; RV64-NEXT:    mulhu a6, a4, a2
+; RV64-NEXT:    add a5, a6, a5
 ; RV64-NEXT:    sltu a0, a0, a3
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a2
-; RV64-NEXT:    add a0, a5, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, a5, a1
 ; RV64-NEXT:    mul a0, a4, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 3
@@ -74,11 +74,11 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
 ; RV32-NEXT:    addi a3, a3, -820
 ; RV32-NEXT:    mul a3, a5, a3
 ; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
@@ -99,11 +99,11 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a4, a0, a3
 ; RV64-NEXT:    mul a5, a4, a6
 ; RV64-NEXT:    mulhu a6, a4, a2
+; RV64-NEXT:    add a5, a6, a5
 ; RV64-NEXT:    sltu a0, a0, a3
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a2
-; RV64-NEXT:    add a0, a5, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, a5, a1
 ; RV64-NEXT:    mul a0, a4, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 5
@@ -181,11 +181,11 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind {
 ; RV32-NEXT:    mul a5, a3, a5
 ; RV32-NEXT:    addi a4, a4, -273
 ; RV32-NEXT:    mulhu a6, a3, a4
+; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a5, a1
 ; RV32-NEXT:    mul a0, a3, a4
 ; RV32-NEXT:    ret
 ;
@@ -208,11 +208,11 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a3, a0, a2
 ; RV64-NEXT:    mul a4, a3, a4
 ; RV64-NEXT:    mulhu a6, a3, a5
+; RV64-NEXT:    add a4, a6, a4
 ; RV64-NEXT:    sltu a0, a0, a2
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a5
-; RV64-NEXT:    add a0, a4, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a5
+; RV64-NEXT:    add a1, a4, a1
 ; RV64-NEXT:    mul a0, a3, a5
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 15
@@ -236,11 +236,11 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
 ; RV32-NEXT:    addi a3, a3, 240
 ; RV32-NEXT:    mul a3, a5, a3
 ; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
@@ -261,11 +261,11 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a4, a0, a3
 ; RV64-NEXT:    mul a5, a4, a6
 ; RV64-NEXT:    mulhu a6, a4, a2
+; RV64-NEXT:    add a5, a6, a5
 ; RV64-NEXT:    sltu a0, a0, a3
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a2
-; RV64-NEXT:    add a0, a5, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, a5, a1
 ; RV64-NEXT:    mul a0, a4, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 17
@@ -291,11 +291,11 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind {
 ; RV32-NEXT:    mul a5, a3, a5
 ; RV32-NEXT:    addi a4, a4, -257
 ; RV32-NEXT:    mulhu a6, a3, a4
+; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a5, a1
 ; RV32-NEXT:    mul a0, a3, a4
 ; RV32-NEXT:    ret
 ;
@@ -318,11 +318,11 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a3, a0, a2
 ; RV64-NEXT:    mul a4, a3, a4
 ; RV64-NEXT:    mulhu a6, a3, a5
+; RV64-NEXT:    add a4, a6, a4
 ; RV64-NEXT:    sltu a0, a0, a2
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a5
-; RV64-NEXT:    add a0, a4, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a5
+; RV64-NEXT:    add a1, a4, a1
 ; RV64-NEXT:    mul a0, a3, a5
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 255
@@ -346,11 +346,11 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
 ; RV32-NEXT:    addi a3, a3, -256
 ; RV32-NEXT:    mul a3, a5, a3
 ; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
@@ -371,11 +371,11 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a4, a0, a3
 ; RV64-NEXT:    mul a5, a4, a6
 ; RV64-NEXT:    mulhu a6, a4, a2
+; RV64-NEXT:    add a5, a6, a5
 ; RV64-NEXT:    sltu a0, a0, a3
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a2
-; RV64-NEXT:    add a0, a5, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, a5, a1
 ; RV64-NEXT:    mul a0, a4, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 257
@@ -401,12 +401,12 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind {
 ; RV32-NEXT:    mul a5, a3, a5
 ; RV32-NEXT:    addi a4, a4, -1
 ; RV32-NEXT:    mulhu a4, a3, a4
+; RV32-NEXT:    add a4, a4, a5
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
 ; RV32-NEXT:    slli a0, a1, 16
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sub a1, a5, a0
-; RV32-NEXT:    add a1, a4, a1
+; RV32-NEXT:    sub a1, a4, a0
 ; RV32-NEXT:    slli a0, a3, 16
 ; RV32-NEXT:    neg a2, a3
 ; RV32-NEXT:    sub a0, a2, a0
@@ -433,11 +433,11 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind {
 ; RV64-NEXT:    mul a5, a3, a5
 ; RV64-NEXT:    addi a4, a4, -1
 ; RV64-NEXT:    mulhu a6, a3, a4
+; RV64-NEXT:    add a5, a6, a5
 ; RV64-NEXT:    sltu a0, a0, a2
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a4
-; RV64-NEXT:    add a0, a5, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a4
+; RV64-NEXT:    add a1, a5, a1
 ; RV64-NEXT:    mul a0, a3, a4
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 65535
@@ -460,12 +460,12 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
 ; RV32-NEXT:    sub a3, a0, a2
 ; RV32-NEXT:    mulhu a4, a3, a4
 ; RV32-NEXT:    slli a5, a3, 16
+; RV32-NEXT:    sub a4, a4, a5
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
 ; RV32-NEXT:    slli a0, a1, 16
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    sub a0, a5, a1
-; RV32-NEXT:    sub a1, a4, a0
+; RV32-NEXT:    add a1, a4, a1
 ; RV32-NEXT:    sub a0, a3, a5
 ; RV32-NEXT:    ret
 ;
@@ -488,11 +488,11 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a5, a0, a2
 ; RV64-NEXT:    mul a3, a5, a3
 ; RV64-NEXT:    mulhu a6, a5, a4
+; RV64-NEXT:    add a3, a6, a3
 ; RV64-NEXT:    sltu a0, a0, a2
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a4
-; RV64-NEXT:    add a0, a3, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a4
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    mul a0, a5, a4
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 65537
@@ -520,11 +520,11 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32-NEXT:    addi a3, a3, -1366
 ; RV32-NEXT:    mul a3, a5, a3
 ; RV32-NEXT:    mulhu a6, a5, a4
+; RV32-NEXT:    add a3, a6, a3
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a0, a1, a4
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a1, a6, a0
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    mul a0, a5, a4
 ; RV32-NEXT:    ret
 ;
@@ -549,11 +549,11 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV64-NEXT:    sub a4, a0, a3
 ; RV64-NEXT:    mul a5, a4, a6
 ; RV64-NEXT:    mulhu a6, a4, a2
+; RV64-NEXT:    add a5, a6, a5
 ; RV64-NEXT:    sltu a0, a0, a3
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a0, a1, a2
-; RV64-NEXT:    add a0, a5, a0
-; RV64-NEXT:    add a1, a6, a0
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, a5, a1
 ; RV64-NEXT:    mul a0, a4, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 12

diff --git a/llvm/test/CodeGen/RISCV/srem-lkk.ll b/llvm/test/CodeGen/RISCV/srem-lkk.ll
index 38e75acecea1a..1b8dee85296b0 100644
--- a/llvm/test/CodeGen/RISCV/srem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-lkk.ll
@@ -240,7 +240,7 @@ define i32 @combine_srem_sdiv(i32 %x) nounwind {
 ; RV32IM-NEXT:    add a1, a1, a2
 ; RV32IM-NEXT:    li a2, 95
 ; RV32IM-NEXT:    mul a2, a1, a2
-; RV32IM-NEXT:    sub a2, a2, a1
+; RV32IM-NEXT:    add a0, a0, a1
 ; RV32IM-NEXT:    sub a0, a0, a2
 ; RV32IM-NEXT:    ret
 ;
@@ -278,7 +278,7 @@ define i32 @combine_srem_sdiv(i32 %x) nounwind {
 ; RV64IM-NEXT:    add a1, a1, a2
 ; RV64IM-NEXT:    li a2, 95
 ; RV64IM-NEXT:    mulw a2, a1, a2
-; RV64IM-NEXT:    subw a2, a2, a1
+; RV64IM-NEXT:    add a0, a0, a1
 ; RV64IM-NEXT:    subw a0, a0, a2
 ; RV64IM-NEXT:    ret
   %1 = srem i32 %x, 95

diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index b41d8194d94b1..9e00b1282796a 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -366,8 +366,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    slli a1, a1, 1
 ; RV32-NEXT:    slli a0, a0, 2
-; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    sw a0, 8(s0)
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -516,8 +516,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    andi a1, a1, 1
 ; RV32M-NEXT:    slli a1, a1, 1
 ; RV32M-NEXT:    slli a0, a0, 2
-; RV32M-NEXT:    or a0, a1, a0
 ; RV32M-NEXT:    or a0, a2, a0
+; RV32M-NEXT:    or a0, a0, a1
 ; RV32M-NEXT:    sw a0, 8(s0)
 ; RV32M-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32M-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -561,8 +561,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64M-NEXT:    srai a4, a4, 1
 ; RV64M-NEXT:    add a4, a4, a5
 ; RV64M-NEXT:    slli a5, a4, 3
-; RV64M-NEXT:    sub a3, a3, a5
 ; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    sub a3, a3, a5
 ; RV64M-NEXT:    addi a3, a3, -1
 ; RV64M-NEXT:    seqz a3, a3
 ; RV64M-NEXT:    lui a4, %hi(.LCPI3_2)
@@ -691,8 +691,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    andi a2, a2, 1
 ; RV32MV-NEXT:    slli a2, a2, 1
 ; RV32MV-NEXT:    slli a0, a0, 2
-; RV32MV-NEXT:    or a0, a2, a0
 ; RV32MV-NEXT:    or a0, a1, a0
+; RV32MV-NEXT:    or a0, a0, a2
 ; RV32MV-NEXT:    sw a0, 8(s2)
 ; RV32MV-NEXT:    addi sp, s0, -64
 ; RV32MV-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
@@ -743,8 +743,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64MV-NEXT:    slli a4, a3, 3
 ; RV64MV-NEXT:    lui a5, %hi(.LCPI3_2)
 ; RV64MV-NEXT:    ld a5, %lo(.LCPI3_2)(a5)
-; RV64MV-NEXT:    sub a2, a2, a4
 ; RV64MV-NEXT:    add a2, a2, a3
+; RV64MV-NEXT:    sub a2, a2, a4
 ; RV64MV-NEXT:    sd a2, 8(sp)
 ; RV64MV-NEXT:    mulh a2, a1, a5
 ; RV64MV-NEXT:    srli a3, a2, 63

diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index cbb0a6506793b..50d4a55842f1a 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -476,14 +476,14 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    srai a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, t5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    sub a5, a7, a5
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sub a5, t4, t3
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    sub a5, t2, t1
-; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    sub a5, t0, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    add a2, a2, a5
+; RV32IM-NEXT:    sub a2, a2, a7
+; RV32IM-NEXT:    add a3, a3, t3
+; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a1, a1, t1
+; RV32IM-NEXT:    sub a1, a1, t2
+; RV32IM-NEXT:    add a4, a4, a6
+; RV32IM-NEXT:    sub a4, a4, t0
 ; RV32IM-NEXT:    sh a4, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
 ; RV32IM-NEXT:    sh a3, 2(a0)
@@ -593,18 +593,18 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    srai a3, a3, 6
 ; RV64IM-NEXT:    add a3, a3, t5
 ; RV64IM-NEXT:    mulw a7, a3, a7
-; RV64IM-NEXT:    subw a3, a7, a3
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    subw a3, t4, t3
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    subw a3, t2, t1
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    subw a3, t0, a6
-; RV64IM-NEXT:    subw a2, a2, a3
+; RV64IM-NEXT:    add a3, a4, a3
+; RV64IM-NEXT:    subw a3, a3, a7
+; RV64IM-NEXT:    add a5, a5, t3
+; RV64IM-NEXT:    subw a4, a5, t4
+; RV64IM-NEXT:    add a1, a1, t1
+; RV64IM-NEXT:    subw a1, a1, t2
+; RV64IM-NEXT:    add a2, a2, a6
+; RV64IM-NEXT:    subw a2, a2, t0
 ; RV64IM-NEXT:    sh a2, 6(a0)
 ; RV64IM-NEXT:    sh a1, 4(a0)
-; RV64IM-NEXT:    sh a5, 2(a0)
-; RV64IM-NEXT:    sh a4, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   %2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>

diff --git a/llvm/test/CodeGen/RISCV/ssub_sat.ll b/llvm/test/CodeGen/RISCV/ssub_sat.ll
index ed3f240468677..9571ca317316a 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat.ll
@@ -59,8 +59,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a4, a1
 ; RV32-NEXT:    sltu a1, a0, a2
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    sub a1, a4, a1
+; RV32-NEXT:    sub a5, a4, a3
+; RV32-NEXT:    sub a1, a5, a1
 ; RV32-NEXT:    xor a5, a4, a1
 ; RV32-NEXT:    xor a3, a4, a3
 ; RV32-NEXT:    and a3, a3, a5

diff --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
index 064c4f94ba1f4..32e794fc778e6 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
@@ -65,8 +65,8 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    mv a2, a1
 ; RV32-NEXT:    sltu a1, a0, a4
-; RV32-NEXT:    add a1, a5, a1
-; RV32-NEXT:    sub a1, a2, a1
+; RV32-NEXT:    sub a3, a2, a5
+; RV32-NEXT:    sub a1, a3, a1
 ; RV32-NEXT:    xor a3, a2, a1
 ; RV32-NEXT:    xor a2, a2, a5
 ; RV32-NEXT:    and a2, a2, a3

diff --git a/llvm/test/CodeGen/RISCV/uadd_sat.ll b/llvm/test/CodeGen/RISCV/uadd_sat.ll
index 6da9efcc7f68f..dbcb68eb3c9eb 100644
--- a/llvm/test/CodeGen/RISCV/uadd_sat.ll
+++ b/llvm/test/CodeGen/RISCV/uadd_sat.ll
@@ -47,10 +47,10 @@ define signext i32 @func(i32 signext %x, i32 signext %y) nounwind {
 define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: func2:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    add a3, a1, a3
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
 ; RV32I-NEXT:    add a3, a3, a0
-; RV32I-NEXT:    add a3, a1, a3
 ; RV32I-NEXT:    beq a3, a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    sltu a0, a3, a1
@@ -70,10 +70,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ;
 ; RV32IZbb-LABEL: func2:
 ; RV32IZbb:       # %bb.0:
+; RV32IZbb-NEXT:    add a3, a1, a3
 ; RV32IZbb-NEXT:    add a2, a0, a2
 ; RV32IZbb-NEXT:    sltu a0, a2, a0
 ; RV32IZbb-NEXT:    add a3, a3, a0
-; RV32IZbb-NEXT:    add a3, a1, a3
 ; RV32IZbb-NEXT:    beq a3, a1, .LBB1_2
 ; RV32IZbb-NEXT:  # %bb.1:
 ; RV32IZbb-NEXT:    sltu a0, a3, a1

diff --git a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
index 9df19570de627..7a8c4c04fd0e1 100644
--- a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
@@ -54,10 +54,10 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32I-LABEL: func64:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    add a2, a1, a5
 ; RV32I-NEXT:    add a4, a0, a4
 ; RV32I-NEXT:    sltu a0, a4, a0
-; RV32I-NEXT:    add a2, a5, a0
-; RV32I-NEXT:    add a2, a1, a2
+; RV32I-NEXT:    add a2, a2, a0
 ; RV32I-NEXT:    beq a2, a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    sltu a0, a2, a1
@@ -77,10 +77,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ;
 ; RV32IZbb-LABEL: func64:
 ; RV32IZbb:       # %bb.0:
+; RV32IZbb-NEXT:    add a2, a1, a5
 ; RV32IZbb-NEXT:    add a4, a0, a4
 ; RV32IZbb-NEXT:    sltu a0, a4, a0
-; RV32IZbb-NEXT:    add a2, a5, a0
-; RV32IZbb-NEXT:    add a2, a1, a2
+; RV32IZbb-NEXT:    add a2, a2, a0
 ; RV32IZbb-NEXT:    beq a2, a1, .LBB1_2
 ; RV32IZbb-NEXT:  # %bb.1:
 ; RV32IZbb-NEXT:    sltu a0, a2, a1

diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index 5e624d689128e..d57c62170699a 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -10,19 +10,19 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a4, 12(a1)
+; RISCV32-NEXT:    lw a3, 12(a1)
 ; RISCV32-NEXT:    lw a7, 12(a2)
 ; RISCV32-NEXT:    lw a6, 8(a1)
-; RISCV32-NEXT:    lw a3, 0(a2)
+; RISCV32-NEXT:    lw a4, 0(a2)
 ; RISCV32-NEXT:    lw a5, 0(a1)
 ; RISCV32-NEXT:    lw t3, 4(a1)
 ; RISCV32-NEXT:    lw t0, 8(a2)
 ; RISCV32-NEXT:    lw a2, 4(a2)
-; RISCV32-NEXT:    mulhu a1, a5, a3
-; RISCV32-NEXT:    mul t1, t3, a3
+; RISCV32-NEXT:    mulhu a1, a5, a4
+; RISCV32-NEXT:    mul t1, t3, a4
 ; RISCV32-NEXT:    add a1, t1, a1
 ; RISCV32-NEXT:    sltu t1, a1, t1
-; RISCV32-NEXT:    mulhu t2, t3, a3
+; RISCV32-NEXT:    mulhu t2, t3, a4
 ; RISCV32-NEXT:    add t4, t2, t1
 ; RISCV32-NEXT:    mul t1, a5, a2
 ; RISCV32-NEXT:    add a1, t1, a1
@@ -33,65 +33,65 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    mul t6, t3, a2
 ; RISCV32-NEXT:    add s0, t6, t5
 ; RISCV32-NEXT:    mul t1, t0, a5
-; RISCV32-NEXT:    mul s3, a6, a3
+; RISCV32-NEXT:    mul s3, a6, a4
 ; RISCV32-NEXT:    add s4, s3, t1
 ; RISCV32-NEXT:    add t1, s0, s4
 ; RISCV32-NEXT:    sltu t2, t1, s0
-; RISCV32-NEXT:    sltu t6, s0, t6
+; RISCV32-NEXT:    sltu s0, s0, t6
 ; RISCV32-NEXT:    sltu t4, t5, t4
-; RISCV32-NEXT:    mulhu s1, t3, a2
-; RISCV32-NEXT:    add t4, t4, t6
-; RISCV32-NEXT:    add s1, s1, t4
+; RISCV32-NEXT:    mulhu t5, t3, a2
+; RISCV32-NEXT:    add t4, t5, t4
+; RISCV32-NEXT:    add s0, t4, s0
 ; RISCV32-NEXT:    mul t4, t3, t0
-; RISCV32-NEXT:    mul s2, a7, a5
-; RISCV32-NEXT:    mulhu s0, t0, a5
-; RISCV32-NEXT:    add t4, s0, t4
-; RISCV32-NEXT:    add s2, t4, s2
+; RISCV32-NEXT:    mul t5, a7, a5
+; RISCV32-NEXT:    add t4, t5, t4
+; RISCV32-NEXT:    mulhu s1, t0, a5
+; RISCV32-NEXT:    add s2, s1, t4
 ; RISCV32-NEXT:    mul t4, a2, a6
-; RISCV32-NEXT:    mul t6, a4, a3
-; RISCV32-NEXT:    mulhu t5, a6, a3
+; RISCV32-NEXT:    mul t5, a3, a4
 ; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    add t6, t4, t6
-; RISCV32-NEXT:    sltu t4, s4, s3
-; RISCV32-NEXT:    add t4, s2, t4
-; RISCV32-NEXT:    add t4, t6, t4
+; RISCV32-NEXT:    mulhu t5, a6, a4
+; RISCV32-NEXT:    add t6, t5, t4
+; RISCV32-NEXT:    add t4, t6, s2
+; RISCV32-NEXT:    sltu s3, s4, s3
+; RISCV32-NEXT:    add t4, t4, s3
+; RISCV32-NEXT:    add t4, s0, t4
 ; RISCV32-NEXT:    add t4, t4, t2
-; RISCV32-NEXT:    add t4, s1, t4
-; RISCV32-NEXT:    beq t4, s1, .LBB0_2
+; RISCV32-NEXT:    beq t4, s0, .LBB0_2
 ; RISCV32-NEXT:  # %bb.1: # %start
-; RISCV32-NEXT:    sltu t2, t4, s1
+; RISCV32-NEXT:    sltu t2, t4, s0
 ; RISCV32-NEXT:  .LBB0_2: # %start
-; RISCV32-NEXT:    sltu s0, s2, s0
+; RISCV32-NEXT:    sltu s0, s2, s1
 ; RISCV32-NEXT:    snez s1, t3
 ; RISCV32-NEXT:    snez s2, a7
 ; RISCV32-NEXT:    and s1, s2, s1
 ; RISCV32-NEXT:    mulhu s2, a7, a5
 ; RISCV32-NEXT:    snez s2, s2
+; RISCV32-NEXT:    or s1, s1, s2
 ; RISCV32-NEXT:    mulhu t3, t3, t0
 ; RISCV32-NEXT:    snez t3, t3
-; RISCV32-NEXT:    or t3, s2, t3
-; RISCV32-NEXT:    or t3, t3, s0
 ; RISCV32-NEXT:    or t3, s1, t3
+; RISCV32-NEXT:    or t3, t3, s0
 ; RISCV32-NEXT:    sltu t5, t6, t5
 ; RISCV32-NEXT:    snez t6, a2
-; RISCV32-NEXT:    snez s0, a4
+; RISCV32-NEXT:    snez s0, a3
 ; RISCV32-NEXT:    and t6, s0, t6
-; RISCV32-NEXT:    mulhu s0, a4, a3
+; RISCV32-NEXT:    mulhu s0, a3, a4
 ; RISCV32-NEXT:    snez s0, s0
+; RISCV32-NEXT:    or t6, t6, s0
 ; RISCV32-NEXT:    mulhu a2, a2, a6
 ; RISCV32-NEXT:    snez a2, a2
-; RISCV32-NEXT:    or a2, s0, a2
+; RISCV32-NEXT:    or a2, t6, a2
 ; RISCV32-NEXT:    or a2, a2, t5
 ; RISCV32-NEXT:    or a7, t0, a7
 ; RISCV32-NEXT:    snez a7, a7
-; RISCV32-NEXT:    or a4, a6, a4
-; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    and a4, a4, a7
-; RISCV32-NEXT:    or a2, a4, a2
-; RISCV32-NEXT:    or a4, t6, t3
-; RISCV32-NEXT:    or a4, a4, t2
-; RISCV32-NEXT:    or a2, a2, a4
-; RISCV32-NEXT:    mul a3, a5, a3
+; RISCV32-NEXT:    or a3, a6, a3
+; RISCV32-NEXT:    snez a3, a3
+; RISCV32-NEXT:    and a3, a3, a7
+; RISCV32-NEXT:    or a2, a3, a2
+; RISCV32-NEXT:    or a3, t3, t2
+; RISCV32-NEXT:    or a2, a2, a3
+; RISCV32-NEXT:    mul a3, a5, a4
 ; RISCV32-NEXT:    andi a2, a2, 1
 ; RISCV32-NEXT:    sw a3, 0(a0)
 ; RISCV32-NEXT:    sw a1, 4(a0)

diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index 191eeabf579f8..d46e6c680aeff 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -40,11 +40,11 @@ define i24 @load_i24(ptr %p) {
 ; NOMISALIGN-LABEL: load_i24:
 ; NOMISALIGN:       # %bb.0:
 ; NOMISALIGN-NEXT:    lbu a1, 1(a0)
-; NOMISALIGN-NEXT:    lb a2, 2(a0)
-; NOMISALIGN-NEXT:    lbu a0, 0(a0)
+; NOMISALIGN-NEXT:    lbu a2, 0(a0)
+; NOMISALIGN-NEXT:    lb a0, 2(a0)
 ; NOMISALIGN-NEXT:    slli a1, a1, 8
-; NOMISALIGN-NEXT:    slli a2, a2, 16
-; NOMISALIGN-NEXT:    or a0, a0, a2
+; NOMISALIGN-NEXT:    or a1, a1, a2
+; NOMISALIGN-NEXT:    slli a0, a0, 16
 ; NOMISALIGN-NEXT:    or a0, a1, a0
 ; NOMISALIGN-NEXT:    ret
 ;
@@ -70,7 +70,7 @@ define i32 @load_i32(ptr %p) {
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
 ;
@@ -84,7 +84,7 @@ define i32 @load_i32(ptr %p) {
 ; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -107,8 +107,8 @@ define i64 @load_i64(ptr %p) {
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a1, a3, a1
-; RV32I-NEXT:    or a2, a4, a1
+; RV32I-NEXT:    or a2, a4, a3
+; RV32I-NEXT:    or a2, a2, a1
 ; RV32I-NEXT:    lbu a1, 5(a0)
 ; RV32I-NEXT:    lbu a3, 4(a0)
 ; RV32I-NEXT:    lbu a4, 6(a0)
@@ -117,7 +117,7 @@ define i64 @load_i64(ptr %p) {
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
@@ -127,25 +127,25 @@ define i64 @load_i64(ptr %p) {
 ; RV64I-NEXT:    lbu a1, 1(a0)
 ; RV64I-NEXT:    lbu a2, 0(a0)
 ; RV64I-NEXT:    lbu a3, 2(a0)
+; RV64I-NEXT:    lbu a4, 3(a0)
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    lbu a2, 5(a0)
-; RV64I-NEXT:    lbu a4, 3(a0)
+; RV64I-NEXT:    slli a4, a4, 24
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    lbu a2, 5(a0)
 ; RV64I-NEXT:    lbu a3, 4(a0)
-; RV64I-NEXT:    slli a2, a2, 8
-; RV64I-NEXT:    lbu a5, 6(a0)
+; RV64I-NEXT:    lbu a4, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a2, a5, a2
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    ret
 ;
 ; MISALIGN-RV32I-LABEL: load_i64:

diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index 218281fb5b771..73ec7d23f554f 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -140,7 +140,7 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
 ; RV32IM-NEXT:    srli a1, a1, 6
 ; RV32IM-NEXT:    li a2, 95
 ; RV32IM-NEXT:    mul a2, a1, a2
-; RV32IM-NEXT:    sub a2, a2, a1
+; RV32IM-NEXT:    add a0, a0, a1
 ; RV32IM-NEXT:    sub a0, a0, a2
 ; RV32IM-NEXT:    ret
 ;
@@ -180,7 +180,7 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
 ; RV64IM-NEXT:    srli a1, a1, 6
 ; RV64IM-NEXT:    li a2, 95
 ; RV64IM-NEXT:    mulw a2, a1, a2
-; RV64IM-NEXT:    subw a2, a2, a1
+; RV64IM-NEXT:    add a0, a0, a1
 ; RV64IM-NEXT:    subw a0, a0, a2
 ; RV64IM-NEXT:    ret
   %1 = urem i32 %x, 95

diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index ccd509918b3ae..1ac6179f62e45 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -579,8 +579,8 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    andi a3, a3, 2047
 ; RV32MV-NEXT:    slli a3, a3, 11
 ; RV32MV-NEXT:    slli a1, a1, 22
-; RV32MV-NEXT:    or a1, a3, a1
 ; RV32MV-NEXT:    or a1, a2, a1
+; RV32MV-NEXT:    or a1, a1, a3
 ; RV32MV-NEXT:    sw a1, 0(a0)
 ; RV32MV-NEXT:    addi sp, sp, 16
 ; RV32MV-NEXT:    ret
@@ -641,7 +641,7 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 2
 ; RV64MV-NEXT:    vmv.x.s a3, v8
 ; RV64MV-NEXT:    slli a3, a3, 22
-; RV64MV-NEXT:    or a2, a2, a3
+; RV64MV-NEXT:    or a1, a1, a3
 ; RV64MV-NEXT:    or a1, a1, a2
 ; RV64MV-NEXT:    sw a1, 0(a0)
 ; RV64MV-NEXT:    slli a1, a1, 31

diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index d1622728db47a..b2e50f9fc7c5b 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -401,14 +401,14 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    mul t4, t3, a7
 ; RV32IM-NEXT:    mulhu a5, a2, a5
 ; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    sub a5, a7, a5
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sub a5, t4, t3
-; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    sub a5, t2, t1
-; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    sub a5, t0, a6
-; RV32IM-NEXT:    sub a4, a4, a5
+; RV32IM-NEXT:    add a2, a2, a5
+; RV32IM-NEXT:    sub a2, a2, a7
+; RV32IM-NEXT:    add a3, a3, t3
+; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    add a1, a1, t1
+; RV32IM-NEXT:    sub a1, a1, t2
+; RV32IM-NEXT:    add a4, a4, a6
+; RV32IM-NEXT:    sub a4, a4, t0
 ; RV32IM-NEXT:    sh a4, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
 ; RV32IM-NEXT:    sh a3, 2(a0)
@@ -502,18 +502,18 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    mulw t4, t3, a7
 ; RV64IM-NEXT:    mulhu a3, a4, a3
 ; RV64IM-NEXT:    mulw a7, a3, a7
-; RV64IM-NEXT:    subw a3, a7, a3
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    subw a3, t4, t3
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    subw a3, t2, t1
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    subw a3, t0, a6
-; RV64IM-NEXT:    subw a2, a2, a3
+; RV64IM-NEXT:    add a3, a4, a3
+; RV64IM-NEXT:    subw a3, a3, a7
+; RV64IM-NEXT:    add a5, a5, t3
+; RV64IM-NEXT:    subw a4, a5, t4
+; RV64IM-NEXT:    add a1, a1, t1
+; RV64IM-NEXT:    subw a1, a1, t2
+; RV64IM-NEXT:    add a2, a2, a6
+; RV64IM-NEXT:    subw a2, a2, t0
 ; RV64IM-NEXT:    sh a2, 6(a0)
 ; RV64IM-NEXT:    sh a1, 4(a0)
-; RV64IM-NEXT:    sh a5, 2(a0)
-; RV64IM-NEXT:    sh a4, 0(a0)
+; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   %2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>

diff --git a/llvm/test/CodeGen/RISCV/usub_sat.ll b/llvm/test/CodeGen/RISCV/usub_sat.ll
index c828886c42f2a..aab5626576427 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat.ll
@@ -46,8 +46,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32I-LABEL: func2:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
-; RV32I-NEXT:    add a3, a3, a4
 ; RV32I-NEXT:    sub a3, a1, a3
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sub a2, a0, a2
 ; RV32I-NEXT:    beq a3, a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
@@ -72,8 +72,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32IZbb-LABEL: func2:
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    sltu a4, a0, a2
-; RV32IZbb-NEXT:    add a3, a3, a4
 ; RV32IZbb-NEXT:    sub a3, a1, a3
+; RV32IZbb-NEXT:    sub a3, a3, a4
 ; RV32IZbb-NEXT:    sub a2, a0, a2
 ; RV32IZbb-NEXT:    beq a3, a1, .LBB1_2
 ; RV32IZbb-NEXT:  # %bb.1:

diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
index 50ec3528b68d4..6f868b328b7cc 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
@@ -53,8 +53,8 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32I-LABEL: func64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a2, a0, a4
-; RV32I-NEXT:    add a2, a5, a2
-; RV32I-NEXT:    sub a2, a1, a2
+; RV32I-NEXT:    sub a3, a1, a5
+; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    sub a3, a0, a4
 ; RV32I-NEXT:    beq a2, a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
@@ -79,8 +79,8 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32IZbb-LABEL: func64:
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    sltu a2, a0, a4
-; RV32IZbb-NEXT:    add a2, a5, a2
-; RV32IZbb-NEXT:    sub a2, a1, a2
+; RV32IZbb-NEXT:    sub a3, a1, a5
+; RV32IZbb-NEXT:    sub a2, a3, a2
 ; RV32IZbb-NEXT:    sub a3, a0, a4
 ; RV32IZbb-NEXT:    beq a2, a1, .LBB1_2
 ; RV32IZbb-NEXT:  # %bb.1:

diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll
index 69703a4359b38..df8a6706997b7 100644
--- a/llvm/test/CodeGen/RISCV/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/vararg.ll
@@ -808,11 +808,11 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -8
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a3, sp, 27
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 4(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a3, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a4, 4(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a3
+; ILP32-ILP32F-FPELIM-NEXT:    lw a3, 4(a0)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a0)
+; ILP32-ILP32F-FPELIM-NEXT:    add a2, a2, a3
+; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a0
 ; ILP32-ILP32F-FPELIM-NEXT:    sltu a1, a0, a1
-; ILP32-ILP32F-FPELIM-NEXT:    add a1, a4, a1
 ; ILP32-ILP32F-FPELIM-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 32
 ; ILP32-ILP32F-FPELIM-NEXT:    ret
@@ -832,11 +832,11 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -8
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a3, s0, 19
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a3, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a3, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a4, 4(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a3
+; ILP32-ILP32F-WITHFP-NEXT:    lw a3, 4(a0)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a0)
+; ILP32-ILP32F-WITHFP-NEXT:    add a2, a2, a3
+; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a0
 ; ILP32-ILP32F-WITHFP-NEXT:    sltu a1, a0, a1
-; ILP32-ILP32F-WITHFP-NEXT:    add a1, a4, a1
 ; ILP32-ILP32F-WITHFP-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 20(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s0, 16(sp) # 4-byte Folded Reload
@@ -855,11 +855,11 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a0, a0, -8
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a3, sp, 27
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 4(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a4, 4(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 4(a0)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a3
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a0
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sltu a1, a0, a1
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a4, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a2, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 32
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    ret
@@ -951,7 +951,7 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a4, 4(a0)
 ; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a3
 ; ILP32-ILP32F-FPELIM-NEXT:    sltu a1, a0, a1
-; ILP32-ILP32F-FPELIM-NEXT:    add a1, a4, a1
+; ILP32-ILP32F-FPELIM-NEXT:    add a2, a2, a4
 ; ILP32-ILP32F-FPELIM-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 32
 ; ILP32-ILP32F-FPELIM-NEXT:    ret
@@ -977,7 +977,7 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a4, 4(a0)
 ; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a3
 ; ILP32-ILP32F-WITHFP-NEXT:    sltu a1, a0, a1
-; ILP32-ILP32F-WITHFP-NEXT:    add a1, a4, a1
+; ILP32-ILP32F-WITHFP-NEXT:    add a2, a2, a4
 ; ILP32-ILP32F-WITHFP-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 20(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s0, 16(sp) # 4-byte Folded Reload
@@ -998,11 +998,11 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 20(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    fld ft0, 0(a0)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    fsd ft0, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 8(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a3
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sltu a1, a0, a1
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a3, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a2, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    ret
@@ -1164,8 +1164,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a3, a0, 4
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 4(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    add a2, s0, a2
-; ILP32-ILP32F-FPELIM-NEXT:    add a0, a2, a0
+; ILP32-ILP32F-FPELIM-NEXT:    add a1, a1, s0
+; ILP32-ILP32F-FPELIM-NEXT:    add a1, a1, a2
 ; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a0
 ; ILP32-ILP32F-FPELIM-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-FPELIM-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1207,8 +1207,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a3, a0, 4
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a3, -16(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    add a2, s1, a2
-; ILP32-ILP32F-WITHFP-NEXT:    add a0, a2, a0
+; ILP32-ILP32F-WITHFP-NEXT:    add a1, a1, s1
+; ILP32-ILP32F-WITHFP-NEXT:    add a1, a1, a2
 ; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a0
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1249,8 +1249,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a3, a0, 4
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 4(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, s0, a2
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a2, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a1, s0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a1, a2
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a0
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1290,8 +1290,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a3, a0, 8
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 8(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a0, 0(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a2, s0, a2
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, a2, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, a1, s0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, a1, a2
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addw a0, a1, a0
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -1333,8 +1333,8 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a3, a0, 8
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, -32(s0)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a0, 0(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    add a2, s1, a2
-; LP64-LP64F-LP64D-WITHFP-NEXT:    add a0, a2, a0
+; LP64-LP64F-LP64D-WITHFP-NEXT:    add a1, a1, s1
+; LP64-LP64F-LP64D-WITHFP-NEXT:    add a1, a1, a2
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addw a0, a1, a0
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index 3c8bf9cf01c8d..ba0f1835057cb 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -14,7 +14,7 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    srlw a0, a0, a1
@@ -37,7 +37,7 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
@@ -47,7 +47,7 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    srl a0, a0, a1
@@ -78,7 +78,7 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    sllw a0, a0, a1
@@ -101,7 +101,7 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
@@ -111,7 +111,7 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sll a0, a0, a1
@@ -142,7 +142,7 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    sraw a0, a0, a1
@@ -165,7 +165,7 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
@@ -175,7 +175,7 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sra a0, a0, a1
@@ -201,25 +201,25 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
 ; RV64I-NEXT:    lbu a5, 6(a1)
@@ -228,8 +228,8 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 0(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
@@ -238,7 +238,7 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -271,8 +271,8 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
 ; RV32I-NEXT:    lbu a6, 2(a1)
@@ -281,7 +281,7 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
@@ -299,7 +299,7 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
 ; RV32I-NEXT:    or a0, a0, a6
 ; RV32I-NEXT:    srl a0, a0, a5
 ; RV32I-NEXT:    slli a3, a3, 1
@@ -338,25 +338,25 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
 ; RV64I-NEXT:    lbu a5, 6(a1)
@@ -365,8 +365,8 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 0(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
@@ -375,7 +375,7 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -408,8 +408,8 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
 ; RV32I-NEXT:    lbu a6, 2(a1)
@@ -418,7 +418,7 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a5, a1, 3
 ; RV32I-NEXT:    addi a4, a5, -32
@@ -436,7 +436,7 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
 ; RV32I-NEXT:    or a0, a0, a6
 ; RV32I-NEXT:    sll a0, a0, a5
 ; RV32I-NEXT:    srli a3, a3, 1
@@ -475,25 +475,25 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    lbu a3, 5(a1)
 ; RV64I-NEXT:    lbu a4, 4(a1)
 ; RV64I-NEXT:    lbu a5, 6(a1)
@@ -502,8 +502,8 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 0(a1)
 ; RV64I-NEXT:    lbu a6, 2(a1)
@@ -512,7 +512,7 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a3, a3, 35
@@ -540,46 +540,46 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a3, 5(a0)
 ; RV32I-NEXT:    lbu a4, 4(a0)
 ; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    lbu a4, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    slli a4, a6, 24
+; RV32I-NEXT:    or a5, a4, a5
 ; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 7(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    slli a4, a1, 3
-; RV32I-NEXT:    addi a6, a4, -32
-; RV32I-NEXT:    sra a1, a3, a4
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    slli a5, a1, 3
+; RV32I-NEXT:    addi a6, a5, -32
+; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a5, a5, 31
+; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a6, 0(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srl a0, a0, a5
 ; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    not a4, a5
 ; RV32I-NEXT:    sll a3, a3, a4
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:  .LBB5_3:
@@ -612,25 +612,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 9(a0)
 ; RV64I-NEXT:    lbu a4, 8(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
+; RV64I-NEXT:    lbu a6, 11(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 13(a0)
-; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 12(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
 ; RV64I-NEXT:    lbu a4, 5(a1)
 ; RV64I-NEXT:    lbu a5, 4(a1)
 ; RV64I-NEXT:    lbu a6, 6(a1)
@@ -639,8 +639,8 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 0(a1)
 ; RV64I-NEXT:    lbu a7, 2(a1)
@@ -649,7 +649,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -664,25 +664,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 0(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    or a6, t0, a6
 ; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    srl a0, a0, a5
 ; RV64I-NEXT:    not a5, a5
 ; RV64I-NEXT:    slli a3, a3, 1
@@ -832,25 +832,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a3, 1(a0)
 ; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
+; RV64I-NEXT:    lbu a6, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 4(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a3, a3, a6
 ; RV64I-NEXT:    lbu a4, 5(a1)
 ; RV64I-NEXT:    lbu a5, 4(a1)
 ; RV64I-NEXT:    lbu a6, 6(a1)
@@ -859,8 +859,8 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a4, a4, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    lbu a5, 1(a1)
 ; RV64I-NEXT:    lbu a6, 0(a1)
 ; RV64I-NEXT:    lbu a7, 2(a1)
@@ -869,7 +869,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a4, a4, 35
@@ -884,25 +884,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a6, 9(a0)
 ; RV64I-NEXT:    lbu a7, 8(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
+; RV64I-NEXT:    lbu t1, 11(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    or a6, t0, a6
 ; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t2, 14(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    sll a0, a0, a5
 ; RV64I-NEXT:    not a5, a5
 ; RV64I-NEXT:    srli a3, a3, 1
@@ -1057,20 +1057,20 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 13(a0)
 ; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
 ; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a3, a3, a6
 ; RV64I-NEXT:    lbu a5, 5(a1)
 ; RV64I-NEXT:    lbu a6, 4(a1)
 ; RV64I-NEXT:    lbu a7, 6(a1)
@@ -1079,8 +1079,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, a7, a5
-; RV64I-NEXT:    or a5, t0, a5
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    lbu a6, 1(a1)
 ; RV64I-NEXT:    lbu a7, 0(a1)
 ; RV64I-NEXT:    lbu t0, 2(a1)
@@ -1089,7 +1089,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    slli a5, a5, 35
@@ -1106,25 +1106,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a6, 0(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a6
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    or a4, a7, a4
 ; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t0, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a6, t1, a6
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    srl a0, a0, a5
 ; RV64I-NEXT:    not a4, a5
 ; RV64I-NEXT:    slli a3, a3, 1

diff  --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index c26c13d61afe7..a0e5999fed2c9 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -14,7 +14,7 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    srlw a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
@@ -36,7 +36,7 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
@@ -46,7 +46,7 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -75,7 +75,7 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    sllw a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
@@ -97,7 +97,7 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
@@ -107,7 +107,7 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -136,7 +136,7 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    sraw a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
@@ -158,7 +158,7 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a0, a0, a5
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    lbu a3, 1(a1)
 ; RV32I-NEXT:    lbu a4, 0(a1)
@@ -168,7 +168,7 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    sb a0, 0(a2)
@@ -197,42 +197,42 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
 ; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    lbu a3, 1(a1)
 ; RV64I-NEXT:    lbu a4, 0(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 3(a1)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    srl a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -261,8 +261,8 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
 ; RV32I-NEXT:    lbu a6, 2(a1)
@@ -271,8 +271,8 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a6, a4
-; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    or a5, a1, a6
+; RV32I-NEXT:    or a5, a5, a4
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    srl a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB3_2
@@ -288,7 +288,7 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
 ; RV32I-NEXT:    or a0, a0, a6
 ; RV32I-NEXT:    srl a0, a0, a5
 ; RV32I-NEXT:    not a5, a5
@@ -331,42 +331,42 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
 ; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    lbu a3, 1(a1)
 ; RV64I-NEXT:    lbu a4, 0(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 3(a1)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sll a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -395,8 +395,8 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu a5, 0(a1)
 ; RV32I-NEXT:    lbu a6, 2(a1)
@@ -405,8 +405,8 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a4, a4, a5
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a5, a6, a4
-; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    or a5, a1, a6
+; RV32I-NEXT:    or a5, a5, a4
 ; RV32I-NEXT:    addi a4, a5, -32
 ; RV32I-NEXT:    sll a1, a3, a5
 ; RV32I-NEXT:    bltz a4, .LBB4_2
@@ -422,7 +422,7 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or a0, a0, t0
 ; RV32I-NEXT:    or a0, a0, a6
 ; RV32I-NEXT:    sll a0, a0, a5
 ; RV32I-NEXT:    not a5, a5
@@ -465,42 +465,42 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
 ; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    lbu a3, 1(a1)
 ; RV64I-NEXT:    lbu a4, 0(a1)
 ; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    lbu a6, 3(a1)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    lbu a6, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sra a0, a0, a1
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    srli a1, a0, 48
@@ -524,44 +524,44 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a3, 5(a0)
 ; RV32I-NEXT:    lbu a4, 4(a0)
 ; RV32I-NEXT:    lbu a5, 6(a0)
+; RV32I-NEXT:    lbu a6, 7(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    or a3, a3, a4
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    lbu a4, 1(a1)
-; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    slli a4, a6, 24
+; RV32I-NEXT:    or a5, a4, a5
 ; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 7(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a6
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    lbu a5, 1(a1)
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a4, a1, a4
-; RV32I-NEXT:    addi a6, a4, -32
-; RV32I-NEXT:    sra a1, a3, a4
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a5, a1, a5
+; RV32I-NEXT:    addi a6, a5, -32
+; RV32I-NEXT:    sra a1, a3, a5
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a5, a5, 31
+; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a1, a4
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    lbu a5, 1(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a6, 0(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a4, a4, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a5, a7, a5
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    srl a0, a0, a4
-; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    or a0, a0, a7
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srl a0, a0, a5
+; RV32I-NEXT:    not a4, a5
 ; RV32I-NEXT:    slli a3, a3, 1
 ; RV32I-NEXT:    sll a3, a3, a4
 ; RV32I-NEXT:    or a0, a0, a3
@@ -599,42 +599,42 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 13(a0)
 ; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    or a3, a3, a6
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a7, 3(a1)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or a5, a5, a7
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a5, t0, a5
+; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a5, a1, a6
+; RV64I-NEXT:    or a5, a1, a4
 ; RV64I-NEXT:    addi a4, a5, -64
 ; RV64I-NEXT:    srl a1, a3, a5
 ; RV64I-NEXT:    bltz a4, .LBB6_2
@@ -645,25 +645,25 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a6, 1(a0)
 ; RV64I-NEXT:    lbu a7, 0(a0)
 ; RV64I-NEXT:    lbu t0, 2(a0)
+; RV64I-NEXT:    lbu t1, 3(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 5(a0)
-; RV64I-NEXT:    lbu t1, 3(a0)
-; RV64I-NEXT:    or a6, t0, a6
 ; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t2, 6(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    srl a0, a0, a5
 ; RV64I-NEXT:    not a5, a5
 ; RV64I-NEXT:    slli a3, a3, 1
@@ -737,7 +737,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a0, 15(a0)
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, s1
 ; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    sb zero, 43(sp)
 ; RV32I-NEXT:    sb zero, 42(sp)
@@ -783,8 +783,8 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a4, a6, a0
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a4, a4, a0
 ; RV32I-NEXT:    andi a5, a1, 7
 ; RV32I-NEXT:    srl a0, a4, a5
 ; RV32I-NEXT:    lbu a1, 9(a3)
@@ -795,8 +795,8 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or a6, t0, a1
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a6, a6, a1
 ; RV32I-NEXT:    slli a1, a6, 1
 ; RV32I-NEXT:    not a7, a5
 ; RV32I-NEXT:    sll a1, a1, a7
@@ -809,8 +809,8 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    or a7, t2, a7
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
 ; RV32I-NEXT:    srl a7, a7, a5
 ; RV32I-NEXT:    slli a4, a4, 1
 ; RV32I-NEXT:    xori t0, a5, 31
@@ -825,7 +825,7 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    or a3, a3, t3
 ; RV32I-NEXT:    or a3, a3, t1
 ; RV32I-NEXT:    slli t1, a3, 1
 ; RV32I-NEXT:    sll t0, t1, t0
@@ -883,42 +883,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 5(a0)
 ; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    lbu a6, 6(a0)
+; RV64I-NEXT:    lbu a7, 7(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 1(a1)
 ; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    or a3, a3, a6
+; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu a7, 3(a1)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or a5, a5, a7
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a5, t0, a5
+; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    or a1, a1, a5
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a5, a1, a6
+; RV64I-NEXT:    or a5, a1, a4
 ; RV64I-NEXT:    addi a4, a5, -64
 ; RV64I-NEXT:    sll a1, a3, a5
 ; RV64I-NEXT:    bltz a4, .LBB7_2
@@ -929,25 +929,25 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a6, 9(a0)
 ; RV64I-NEXT:    lbu a7, 8(a0)
 ; RV64I-NEXT:    lbu t0, 10(a0)
+; RV64I-NEXT:    lbu t1, 11(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    lbu t1, 11(a0)
-; RV64I-NEXT:    or a6, t0, a6
 ; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    lbu t2, 14(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
 ; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    sll a0, a0, a5
 ; RV64I-NEXT:    not a5, a5
 ; RV64I-NEXT:    srli a3, a3, 1
@@ -1021,7 +1021,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a0, 15(a0)
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, s1
 ; RV32I-NEXT:    or a1, a1, s0
 ; RV32I-NEXT:    sb zero, 27(sp)
 ; RV32I-NEXT:    sb zero, 26(sp)
@@ -1067,8 +1067,8 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a4, a6, a0
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a4, a4, a0
 ; RV32I-NEXT:    andi a5, a1, 7
 ; RV32I-NEXT:    sll a0, a4, a5
 ; RV32I-NEXT:    lbu a1, 1(a3)
@@ -1079,8 +1079,8 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or a6, t0, a1
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a6, a6, a1
 ; RV32I-NEXT:    srli a1, a6, 1
 ; RV32I-NEXT:    xori a7, a5, 31
 ; RV32I-NEXT:    srl a1, a1, a7
@@ -1093,8 +1093,8 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t0, t0, t1
 ; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t2, t0
-; RV32I-NEXT:    or t0, t3, t0
+; RV32I-NEXT:    or t1, t3, t2
+; RV32I-NEXT:    or t0, t1, t0
 ; RV32I-NEXT:    sll t0, t0, a5
 ; RV32I-NEXT:    lbu t1, 9(a3)
 ; RV32I-NEXT:    lbu t2, 8(a3)
@@ -1104,7 +1104,7 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    or a3, a3, t3
 ; RV32I-NEXT:    or a3, a3, t1
 ; RV32I-NEXT:    srli t1, a3, 1
 ; RV32I-NEXT:    srl a7, t1, a7
@@ -1167,42 +1167,42 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a3, a3, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a5, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    lbu a4, 13(a0)
 ; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
+; RV64I-NEXT:    lbu a6, 14(a0)
+; RV64I-NEXT:    lbu a7, 15(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t0, a4
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a5, a4, 32
 ; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 2(a1)
-; RV64I-NEXT:    or a3, a3, a6
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 2(a1)
+; RV64I-NEXT:    lbu t0, 3(a1)
 ; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a5, a5, a7
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    or a5, a5, a6
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    lbu t1, 6(a1)
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    or a6, a6, t0
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a6, t1, a6
+; RV64I-NEXT:    or a1, a1, t0
 ; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    or a5, a1, a7
+; RV64I-NEXT:    or a5, a1, a5
 ; RV64I-NEXT:    addi a6, a5, -64
 ; RV64I-NEXT:    sra a1, a3, a5
 ; RV64I-NEXT:    bltz a6, .LBB8_2
@@ -1215,25 +1215,25 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a6, 0(a0)
 ; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a6
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a4, a6, a4
 ; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    or a4, a7, a4
 ; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t0, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a6, a6, 8
 ; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a6, t1, a6
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    srl a0, a0, a5
 ; RV64I-NEXT:    not a4, a5
 ; RV64I-NEXT:    slli a3, a3, 1
@@ -1306,7 +1306,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a0, 14(a0)
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or s1, s2, s1
+; RV32I-NEXT:    or a1, a1, s2
 ; RV32I-NEXT:    or a1, a1, s1
 ; RV32I-NEXT:    sb a3, 23(sp)
 ; RV32I-NEXT:    sb a0, 22(sp)
@@ -1356,8 +1356,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a4, a6, a0
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a4, a4, a0
 ; RV32I-NEXT:    andi a5, a1, 7
 ; RV32I-NEXT:    srl a0, a4, a5
 ; RV32I-NEXT:    lbu a1, 9(a3)
@@ -1368,8 +1368,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or a6, t0, a1
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a6, a6, a1
 ; RV32I-NEXT:    slli a1, a6, 1
 ; RV32I-NEXT:    not a7, a5
 ; RV32I-NEXT:    sll a1, a1, a7
@@ -1382,8 +1382,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a7, a7, t0
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    or a7, t2, a7
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a7, t0, a7
 ; RV32I-NEXT:    srl a7, a7, a5
 ; RV32I-NEXT:    slli a4, a4, 1
 ; RV32I-NEXT:    xori t0, a5, 31
@@ -1398,7 +1398,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli a3, a3, 24
-; RV32I-NEXT:    or t1, t3, t1
+; RV32I-NEXT:    or a3, a3, t3
 ; RV32I-NEXT:    or a3, a3, t1
 ; RV32I-NEXT:    slli t1, a3, 1
 ; RV32I-NEXT:    sll t0, t1, t0
@@ -1490,32 +1490,32 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 1(a1)
 ; RV64I-NEXT:    lbu s10, 0(a1)
 ; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu ra, 3(a1)
 ; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or s9, s9, s10
 ; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli ra, ra, 24
 ; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    lbu ra, 4(a1)
+; RV64I-NEXT:    or s11, ra, s11
 ; RV64I-NEXT:    or s9, s11, s9
-; RV64I-NEXT:    lbu s11, 6(a1)
+; RV64I-NEXT:    lbu s11, 4(a1)
 ; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    or s10, s10, ra
-; RV64I-NEXT:    lbu ra, 7(a1)
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    or s10, s11, s10
+; RV64I-NEXT:    lbu ra, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    or s10, s10, s11
 ; RV64I-NEXT:    lbu s11, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    or s10, ra, s10
+; RV64I-NEXT:    slli ra, ra, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, ra
 ; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli s10, s10, 32
-; RV64I-NEXT:    or s9, s10, s9
+; RV64I-NEXT:    or a1, a1, s10
 ; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or t0, s9, a1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or t0, a1, s9
 ; RV64I-NEXT:    lbu s9, 24(a0)
 ; RV64I-NEXT:    lbu a7, 25(a0)
 ; RV64I-NEXT:    lbu a6, 26(a0)
@@ -1546,6 +1546,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb s0, 68(sp)
 ; RV64I-NEXT:    sb t6, 67(sp)
 ; RV64I-NEXT:    sb t5, 66(sp)
+; RV64I-NEXT:    sb t4, 65(sp)
 ; RV64I-NEXT:    sb zero, 119(sp)
 ; RV64I-NEXT:    sb zero, 118(sp)
 ; RV64I-NEXT:    sb zero, 117(sp)
@@ -1578,7 +1579,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 90(sp)
 ; RV64I-NEXT:    sb zero, 89(sp)
 ; RV64I-NEXT:    sb zero, 88(sp)
-; RV64I-NEXT:    sb t4, 65(sp)
 ; RV64I-NEXT:    sb t3, 64(sp)
 ; RV64I-NEXT:    sb t2, 63(sp)
 ; RV64I-NEXT:    sb t1, 62(sp)
@@ -1606,20 +1606,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    or a0, a4, a0
 ; RV64I-NEXT:    lbu a1, 13(a3)
 ; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a6, 14(a3)
-; RV64I-NEXT:    lbu a7, 15(a3)
+; RV64I-NEXT:    lbu a5, 14(a3)
+; RV64I-NEXT:    lbu a6, 15(a3)
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a1, a6, a1
-; RV64I-NEXT:    or a1, a7, a1
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a4, a0, a5
+; RV64I-NEXT:    or a4, a1, a0
 ; RV64I-NEXT:    andi a1, t0, 7
 ; RV64I-NEXT:    lbu a0, 17(a3)
 ; RV64I-NEXT:    lbu a5, 16(a3)
@@ -1629,20 +1629,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a0, a5, a0
 ; RV64I-NEXT:    lbu a5, 21(a3)
 ; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu t0, 22(a3)
-; RV64I-NEXT:    lbu t1, 23(a3)
+; RV64I-NEXT:    lbu a7, 22(a3)
+; RV64I-NEXT:    lbu t0, 23(a3)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, t1, a5
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    or a5, a0, a7
+; RV64I-NEXT:    or a5, a5, a0
 ; RV64I-NEXT:    slli a0, a5, 1
 ; RV64I-NEXT:    not a6, a1
 ; RV64I-NEXT:    sll a0, a0, a6
@@ -1654,45 +1654,45 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 5(a3)
 ; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t2, 6(a3)
-; RV64I-NEXT:    lbu t3, 7(a3)
+; RV64I-NEXT:    lbu t1, 6(a3)
+; RV64I-NEXT:    lbu t2, 7(a3)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a7, t2, a7
-; RV64I-NEXT:    or a7, t3, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 25(a3)
 ; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t2, 26(a3)
-; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 26(a3)
+; RV64I-NEXT:    lbu t2, 27(a3)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    or a7, t2, a7
 ; RV64I-NEXT:    lbu t1, 28(a3)
 ; RV64I-NEXT:    lbu t2, 30(a3)
+; RV64I-NEXT:    lbu a3, 31(a3)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    lbu t3, 31(a3)
 ; RV64I-NEXT:    or t0, t0, t1
 ; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or t0, t2, t0
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, t2
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    lbu a3, 27(a3)
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or a3, a3, t0
 ; RV64I-NEXT:    xori t0, a1, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a7, a3
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    or a3, a3, a7
 ; RV64I-NEXT:    slli a7, a3, 1
 ; RV64I-NEXT:    sll a7, a7, t0
 ; RV64I-NEXT:    srl a4, a4, a1
@@ -1822,17 +1822,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    lbu s8, 1(a1)
 ; RV32I-NEXT:    lbu s9, 20(a0)
-; RV32I-NEXT:    lbu s10, 0(a1)
-; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
+; RV32I-NEXT:    lbu s11, 0(a1)
 ; RV32I-NEXT:    slli s8, s8, 8
 ; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    or s8, s8, s10
-; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or s8, s8, s11
+; RV32I-NEXT:    lbu s11, 22(a0)
 ; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    or s8, ra, s8
-; RV32I-NEXT:    lbu ra, 23(a0)
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, ra
+; RV32I-NEXT:    lbu ra, 23(a0)
 ; RV32I-NEXT:    or t0, a1, s8
 ; RV32I-NEXT:    lbu s8, 24(a0)
 ; RV32I-NEXT:    lbu a7, 25(a0)
@@ -1851,8 +1851,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a7, 53(sp)
 ; RV32I-NEXT:    sb s8, 52(sp)
 ; RV32I-NEXT:    sb ra, 51(sp)
-; RV32I-NEXT:    sb s10, 50(sp)
-; RV32I-NEXT:    sb s11, 49(sp)
+; RV32I-NEXT:    sb s11, 50(sp)
+; RV32I-NEXT:    sb s10, 49(sp)
 ; RV32I-NEXT:    sb s9, 48(sp)
 ; RV32I-NEXT:    sb s7, 47(sp)
 ; RV32I-NEXT:    sb s6, 46(sp)
@@ -1924,8 +1924,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    or t4, a5, a0
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or t4, a3, a0
 ; RV32I-NEXT:    andi a3, t0, 7
 ; RV32I-NEXT:    lbu a0, 9(a4)
 ; RV32I-NEXT:    lbu a1, 8(a4)
@@ -1935,8 +1935,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    or a1, a6, a5
+; RV32I-NEXT:    or a6, a1, a0
 ; RV32I-NEXT:    slli a0, a6, 1
 ; RV32I-NEXT:    not t0, a3
 ; RV32I-NEXT:    sll a0, a0, t0
@@ -1948,8 +1948,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or t1, t1, a1
+; RV32I-NEXT:    or a5, t1, a7
+; RV32I-NEXT:    or t1, a5, a1
 ; RV32I-NEXT:    slli a1, t4, 1
 ; RV32I-NEXT:    xori t2, a3, 31
 ; RV32I-NEXT:    sll a1, a1, t2
@@ -1961,8 +1961,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a5, a5, a7
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    or a5, t3, a5
-; RV32I-NEXT:    or t3, t5, a5
+; RV32I-NEXT:    or a7, t5, t3
+; RV32I-NEXT:    or t3, a7, a5
 ; RV32I-NEXT:    lbu a5, 17(a4)
 ; RV32I-NEXT:    lbu a7, 16(a4)
 ; RV32I-NEXT:    lbu t5, 18(a4)
@@ -1971,8 +1971,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a5, a5, a7
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t5, a5
-; RV32I-NEXT:    or a5, t6, a5
+; RV32I-NEXT:    or a7, t6, t5
+; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    sll a7, a7, t0
 ; RV32I-NEXT:    lbu t5, 21(a4)
@@ -1983,8 +1983,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t5, t5, t6
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    or t5, s0, t5
-; RV32I-NEXT:    or t5, s1, t5
 ; RV32I-NEXT:    lbu t6, 25(a4)
 ; RV32I-NEXT:    lbu s0, 24(a4)
 ; RV32I-NEXT:    lbu s1, 26(a4)
@@ -1993,23 +1993,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t6, t6, s0
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s1, t6
-; RV32I-NEXT:    or t6, s2, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    or t6, s0, t6
 ; RV32I-NEXT:    lbu s0, 29(a4)
-; RV32I-NEXT:    slli s1, t6, 1
-; RV32I-NEXT:    lbu s2, 28(a4)
-; RV32I-NEXT:    sll t0, s1, t0
+; RV32I-NEXT:    lbu s1, 28(a4)
+; RV32I-NEXT:    slli s2, t6, 1
+; RV32I-NEXT:    sll t0, s2, t0
 ; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    or s0, s0, s1
 ; RV32I-NEXT:    lbu s1, 30(a4)
-; RV32I-NEXT:    or s0, s0, s2
+; RV32I-NEXT:    lbu a4, 31(a4)
 ; RV32I-NEXT:    slli s2, t3, 1
 ; RV32I-NEXT:    sll s2, s2, t2
 ; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    lbu a4, 31(a4)
-; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli a4, a4, 24
+; RV32I-NEXT:    or a4, a4, s1
 ; RV32I-NEXT:    slli s1, t5, 1
 ; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    slli a4, a4, 24
 ; RV32I-NEXT:    or a4, a4, s0
 ; RV32I-NEXT:    slli s0, a4, 1
 ; RV32I-NEXT:    sll t2, s0, t2
@@ -2148,32 +2148,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    lbu s8, 20(a0)
 ; RV64I-NEXT:    lbu s9, 1(a1)
 ; RV64I-NEXT:    lbu s10, 0(a1)
 ; RV64I-NEXT:    lbu s11, 2(a1)
-; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu ra, 3(a1)
 ; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or s9, s9, s10
 ; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli ra, ra, 24
 ; RV64I-NEXT:    lbu s10, 5(a1)
-; RV64I-NEXT:    lbu ra, 4(a1)
+; RV64I-NEXT:    or s11, ra, s11
 ; RV64I-NEXT:    or s9, s11, s9
-; RV64I-NEXT:    lbu s11, 6(a1)
+; RV64I-NEXT:    lbu s11, 4(a1)
 ; RV64I-NEXT:    slli s10, s10, 8
-; RV64I-NEXT:    or s10, s10, ra
-; RV64I-NEXT:    lbu ra, 7(a1)
-; RV64I-NEXT:    slli s11, s11, 16
-; RV64I-NEXT:    or s10, s11, s10
+; RV64I-NEXT:    lbu ra, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    or s10, s10, s11
 ; RV64I-NEXT:    lbu s11, 21(a0)
-; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    or s10, ra, s10
+; RV64I-NEXT:    slli ra, ra, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, ra
 ; RV64I-NEXT:    lbu ra, 22(a0)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli s10, s10, 32
-; RV64I-NEXT:    or s9, s10, s9
+; RV64I-NEXT:    or a1, a1, s10
 ; RV64I-NEXT:    lbu s10, 23(a0)
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or t0, s9, a1
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or t0, a1, s9
 ; RV64I-NEXT:    lbu s9, 24(a0)
 ; RV64I-NEXT:    lbu a7, 25(a0)
 ; RV64I-NEXT:    lbu a6, 26(a0)
@@ -2205,6 +2205,7 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb t6, 99(sp)
 ; RV64I-NEXT:    sb t5, 98(sp)
 ; RV64I-NEXT:    sb t4, 97(sp)
+; RV64I-NEXT:    sb t3, 96(sp)
 ; RV64I-NEXT:    sb zero, 87(sp)
 ; RV64I-NEXT:    sb zero, 86(sp)
 ; RV64I-NEXT:    sb zero, 85(sp)
@@ -2237,7 +2238,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb zero, 58(sp)
 ; RV64I-NEXT:    sb zero, 57(sp)
 ; RV64I-NEXT:    sb zero, 56(sp)
-; RV64I-NEXT:    sb t3, 96(sp)
 ; RV64I-NEXT:    sb t2, 95(sp)
 ; RV64I-NEXT:    sb t1, 94(sp)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
@@ -2264,161 +2264,161 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    lbu a3, 13(a0)
 ; RV64I-NEXT:    lbu a4, 12(a0)
-; RV64I-NEXT:    lbu a6, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
+; RV64I-NEXT:    lbu a5, 14(a0)
+; RV64I-NEXT:    lbu a6, 15(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    or a3, a7, a3
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    or a3, a1, a5
-; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
-; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a1, a5, a1
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu t1, 7(a0)
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    or a3, a3, a1
+; RV64I-NEXT:    andi a1, t0, 7
+; RV64I-NEXT:    lbu a4, 1(a0)
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a6, 2(a0)
+; RV64I-NEXT:    lbu a7, 3(a0)
 ; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 5(a0)
+; RV64I-NEXT:    lbu a6, 4(a0)
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu t0, 7(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, t1, a4
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    lbu a4, 25(a0)
-; RV64I-NEXT:    lbu a5, 24(a0)
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    slli a5, a5, 32
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    lbu a5, 25(a0)
+; RV64I-NEXT:    lbu a6, 24(a0)
 ; RV64I-NEXT:    lbu a7, 26(a0)
-; RV64I-NEXT:    or a6, a1, a6
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a5
+; RV64I-NEXT:    lbu t0, 27(a0)
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a5, a5, a6
 ; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    lbu a1, 29(a0)
-; RV64I-NEXT:    lbu a5, 27(a0)
-; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 29(a0)
 ; RV64I-NEXT:    lbu a7, 28(a0)
-; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    lbu t1, 30(a0)
-; RV64I-NEXT:    lbu t2, 31(a0)
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a1, t1, a1
-; RV64I-NEXT:    or a1, t2, a1
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    lbu a4, 17(a0)
+; RV64I-NEXT:    lbu t0, 30(a0)
+; RV64I-NEXT:    lbu t1, 31(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    lbu a6, 17(a0)
 ; RV64I-NEXT:    lbu a7, 16(a0)
-; RV64I-NEXT:    lbu t1, 18(a0)
-; RV64I-NEXT:    or a5, a1, a5
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a1, a4, a7
-; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    or a1, t1, a1
-; RV64I-NEXT:    lbu a4, 21(a0)
-; RV64I-NEXT:    lbu a7, 20(a0)
+; RV64I-NEXT:    lbu t0, 18(a0)
+; RV64I-NEXT:    lbu t1, 19(a0)
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    or a6, a6, a7
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t1, t1, 24
+; RV64I-NEXT:    lbu a7, 21(a0)
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    lbu t0, 20(a0)
+; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    lbu t1, 22(a0)
-; RV64I-NEXT:    andi t0, t0, 7
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a7
+; RV64I-NEXT:    lbu a0, 23(a0)
+; RV64I-NEXT:    or a7, a7, t0
+; RV64I-NEXT:    srli t0, a4, 1
 ; RV64I-NEXT:    slli t1, t1, 16
-; RV64I-NEXT:    lbu a7, 23(a0)
-; RV64I-NEXT:    or a4, t1, a4
-; RV64I-NEXT:    srli t1, a6, 1
-; RV64I-NEXT:    lbu t2, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    xori a7, t0, 63
-; RV64I-NEXT:    srl a0, t1, a7
-; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    or a4, a1, t2
-; RV64I-NEXT:    srli a1, a4, 1
-; RV64I-NEXT:    srl a7, a1, a7
-; RV64I-NEXT:    srli a1, a3, 1
-; RV64I-NEXT:    not t1, t0
-; RV64I-NEXT:    srl t1, a1, t1
-; RV64I-NEXT:    sll a1, a3, t0
-; RV64I-NEXT:    sll a3, a5, t0
-; RV64I-NEXT:    sll a4, a4, t0
-; RV64I-NEXT:    sll a5, a6, t0
-; RV64I-NEXT:    srli a6, a4, 56
-; RV64I-NEXT:    sb a6, 23(a2)
-; RV64I-NEXT:    srli a6, a4, 48
-; RV64I-NEXT:    sb a6, 22(a2)
-; RV64I-NEXT:    srli a6, a4, 40
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    srli a6, a4, 32
-; RV64I-NEXT:    sb a6, 20(a2)
-; RV64I-NEXT:    srli a6, a4, 24
-; RV64I-NEXT:    sb a6, 19(a2)
-; RV64I-NEXT:    srli a6, a4, 16
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    or a6, a4, t1
-; RV64I-NEXT:    srli a4, a4, 8
-; RV64I-NEXT:    sb a4, 17(a2)
-; RV64I-NEXT:    srli a4, a3, 56
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    srli a4, a3, 48
-; RV64I-NEXT:    sb a4, 30(a2)
-; RV64I-NEXT:    srli a4, a3, 40
-; RV64I-NEXT:    sb a4, 29(a2)
-; RV64I-NEXT:    srli a4, a3, 32
-; RV64I-NEXT:    sb a4, 28(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    sb a4, 27(a2)
-; RV64I-NEXT:    srli a4, a3, 16
-; RV64I-NEXT:    sb a4, 26(a2)
-; RV64I-NEXT:    or a4, a3, a7
-; RV64I-NEXT:    srli a3, a3, 8
-; RV64I-NEXT:    sb a3, 25(a2)
-; RV64I-NEXT:    srli a3, a5, 56
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a5, 48
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    srli a3, a5, 40
-; RV64I-NEXT:    sb a3, 5(a2)
-; RV64I-NEXT:    srli a3, a5, 32
-; RV64I-NEXT:    sb a3, 4(a2)
-; RV64I-NEXT:    srli a3, a5, 24
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a3, a5, 16
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    sb a5, 0(a2)
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    or t1, a0, t1
+; RV64I-NEXT:    xori t2, a1, 63
+; RV64I-NEXT:    srl a0, t0, t2
+; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    srli a7, a6, 1
+; RV64I-NEXT:    srl a7, a7, t2
+; RV64I-NEXT:    srli t0, a3, 1
+; RV64I-NEXT:    not t1, a1
+; RV64I-NEXT:    srl t0, t0, t1
+; RV64I-NEXT:    sll a3, a3, a1
+; RV64I-NEXT:    sll a5, a5, a1
+; RV64I-NEXT:    sll a6, a6, a1
+; RV64I-NEXT:    sll a1, a4, a1
+; RV64I-NEXT:    srli a4, a6, 56
+; RV64I-NEXT:    sb a4, 23(a2)
+; RV64I-NEXT:    srli a4, a6, 48
+; RV64I-NEXT:    sb a4, 22(a2)
+; RV64I-NEXT:    srli a4, a6, 40
+; RV64I-NEXT:    sb a4, 21(a2)
+; RV64I-NEXT:    srli a4, a6, 32
+; RV64I-NEXT:    sb a4, 20(a2)
+; RV64I-NEXT:    srli a4, a6, 24
+; RV64I-NEXT:    sb a4, 19(a2)
+; RV64I-NEXT:    srli a4, a6, 16
+; RV64I-NEXT:    sb a4, 18(a2)
+; RV64I-NEXT:    or a4, a6, t0
+; RV64I-NEXT:    srli a6, a6, 8
+; RV64I-NEXT:    sb a6, 17(a2)
+; RV64I-NEXT:    srli a6, a5, 56
+; RV64I-NEXT:    sb a6, 31(a2)
+; RV64I-NEXT:    srli a6, a5, 48
+; RV64I-NEXT:    sb a6, 30(a2)
+; RV64I-NEXT:    srli a6, a5, 40
+; RV64I-NEXT:    sb a6, 29(a2)
+; RV64I-NEXT:    srli a6, a5, 32
+; RV64I-NEXT:    sb a6, 28(a2)
+; RV64I-NEXT:    srli a6, a5, 24
+; RV64I-NEXT:    sb a6, 27(a2)
+; RV64I-NEXT:    srli a6, a5, 16
+; RV64I-NEXT:    sb a6, 26(a2)
+; RV64I-NEXT:    or a6, a5, a7
 ; RV64I-NEXT:    srli a5, a5, 8
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 48
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    srli a3, a1, 40
-; RV64I-NEXT:    sb a3, 13(a2)
-; RV64I-NEXT:    srli a3, a1, 32
-; RV64I-NEXT:    sb a3, 12(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    srli a3, a1, 16
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    sb a5, 25(a2)
+; RV64I-NEXT:    srli a5, a1, 56
+; RV64I-NEXT:    sb a5, 7(a2)
+; RV64I-NEXT:    srli a5, a1, 48
+; RV64I-NEXT:    sb a5, 6(a2)
+; RV64I-NEXT:    srli a5, a1, 40
+; RV64I-NEXT:    sb a5, 5(a2)
+; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    sb a5, 4(a2)
+; RV64I-NEXT:    srli a5, a1, 24
+; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    srli a5, a1, 16
+; RV64I-NEXT:    sb a5, 2(a2)
+; RV64I-NEXT:    sb a1, 0(a2)
 ; RV64I-NEXT:    srli a1, a1, 8
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a6, 16(a2)
-; RV64I-NEXT:    sb a4, 24(a2)
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    srli a1, a3, 56
+; RV64I-NEXT:    sb a1, 15(a2)
+; RV64I-NEXT:    srli a1, a3, 48
+; RV64I-NEXT:    sb a1, 14(a2)
+; RV64I-NEXT:    srli a1, a3, 40
+; RV64I-NEXT:    sb a1, 13(a2)
+; RV64I-NEXT:    srli a1, a3, 32
+; RV64I-NEXT:    sb a1, 12(a2)
+; RV64I-NEXT:    srli a1, a3, 24
+; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    srli a1, a3, 16
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    or a0, a3, a0
+; RV64I-NEXT:    srli a3, a3, 8
+; RV64I-NEXT:    sb a3, 9(a2)
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb a6, 24(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    ld ra, 216(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 208(sp) # 8-byte Folded Reload
@@ -2480,17 +2480,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    lbu s8, 1(a1)
 ; RV32I-NEXT:    lbu s9, 20(a0)
-; RV32I-NEXT:    lbu s10, 0(a1)
-; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu s10, 21(a0)
+; RV32I-NEXT:    lbu s11, 0(a1)
 ; RV32I-NEXT:    slli s8, s8, 8
 ; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    or s8, s8, s10
-; RV32I-NEXT:    lbu s10, 22(a0)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or s8, s8, s11
+; RV32I-NEXT:    lbu s11, 22(a0)
 ; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    or s8, ra, s8
-; RV32I-NEXT:    lbu ra, 23(a0)
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, ra
+; RV32I-NEXT:    lbu ra, 23(a0)
 ; RV32I-NEXT:    or t0, a1, s8
 ; RV32I-NEXT:    lbu s8, 24(a0)
 ; RV32I-NEXT:    lbu a7, 25(a0)
@@ -2509,8 +2509,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a7, 85(sp)
 ; RV32I-NEXT:    sb s8, 84(sp)
 ; RV32I-NEXT:    sb ra, 83(sp)
-; RV32I-NEXT:    sb s10, 82(sp)
-; RV32I-NEXT:    sb s11, 81(sp)
+; RV32I-NEXT:    sb s11, 82(sp)
+; RV32I-NEXT:    sb s10, 81(sp)
 ; RV32I-NEXT:    sb s9, 80(sp)
 ; RV32I-NEXT:    sb s7, 79(sp)
 ; RV32I-NEXT:    sb s6, 78(sp)
@@ -2582,8 +2582,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    or t4, a4, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or t3, a3, a0
 ; RV32I-NEXT:    andi a1, t0, 7
 ; RV32I-NEXT:    lbu a0, 1(a5)
 ; RV32I-NEXT:    lbu a3, 0(a5)
@@ -2593,46 +2593,45 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    or a3, a6, a4
+; RV32I-NEXT:    or a6, a3, a0
 ; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    xori t0, a1, 31
-; RV32I-NEXT:    srl a0, a0, t0
+; RV32I-NEXT:    xori a7, a1, 31
+; RV32I-NEXT:    srl a0, a0, a7
 ; RV32I-NEXT:    lbu a3, 13(a5)
 ; RV32I-NEXT:    lbu a4, 12(a5)
-; RV32I-NEXT:    lbu a7, 14(a5)
+; RV32I-NEXT:    lbu t0, 14(a5)
 ; RV32I-NEXT:    lbu t1, 15(a5)
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a3, a7, a3
-; RV32I-NEXT:    or t1, t1, a3
+; RV32I-NEXT:    or a4, t1, t0
+; RV32I-NEXT:    or t0, a4, a3
 ; RV32I-NEXT:    lbu a3, 9(a5)
 ; RV32I-NEXT:    lbu a4, 8(a5)
-; RV32I-NEXT:    lbu a7, 10(a5)
+; RV32I-NEXT:    lbu t1, 10(a5)
 ; RV32I-NEXT:    lbu t2, 11(a5)
 ; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a3, a7, a3
-; RV32I-NEXT:    or t2, t2, a3
-; RV32I-NEXT:    srli a3, t2, 1
-; RV32I-NEXT:    srl a3, a3, t0
-; RV32I-NEXT:    srli a4, t4, 1
-; RV32I-NEXT:    not t3, a1
-; RV32I-NEXT:    srl a7, a4, t3
-; RV32I-NEXT:    lbu a4, 21(a5)
+; RV32I-NEXT:    or a4, t2, t1
+; RV32I-NEXT:    or t1, a4, a3
+; RV32I-NEXT:    srli a3, t1, 1
+; RV32I-NEXT:    srl a3, a3, a7
+; RV32I-NEXT:    srli a4, t3, 1
+; RV32I-NEXT:    not t2, a1
+; RV32I-NEXT:    lbu t4, 21(a5)
 ; RV32I-NEXT:    lbu t5, 20(a5)
 ; RV32I-NEXT:    lbu t6, 22(a5)
 ; RV32I-NEXT:    lbu s0, 23(a5)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, t5
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t4, t4, t5
 ; RV32I-NEXT:    slli t6, t6, 16
 ; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or a4, t6, a4
-; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or t4, t5, t4
 ; RV32I-NEXT:    lbu t5, 17(a5)
 ; RV32I-NEXT:    lbu t6, 16(a5)
 ; RV32I-NEXT:    lbu s0, 18(a5)
@@ -2641,8 +2640,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t5, t5, t6
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    or t5, s0, t5
-; RV32I-NEXT:    or t5, s1, t5
 ; RV32I-NEXT:    lbu t6, 29(a5)
 ; RV32I-NEXT:    lbu s0, 28(a5)
 ; RV32I-NEXT:    lbu s1, 30(a5)
@@ -2651,30 +2650,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t6, t6, s0
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    lbu s0, 25(a5)
-; RV32I-NEXT:    or t6, s1, t6
-; RV32I-NEXT:    lbu s1, 24(a5)
-; RV32I-NEXT:    or t6, s2, t6
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    lbu s2, 26(a5)
-; RV32I-NEXT:    or s0, s0, s1
-; RV32I-NEXT:    srli s1, t5, 1
-; RV32I-NEXT:    srl s1, s1, t0
-; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    lbu s1, 25(a5)
+; RV32I-NEXT:    lbu s2, 24(a5)
+; RV32I-NEXT:    srl a4, a4, t2
+; RV32I-NEXT:    or t6, s0, t6
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    or s0, s1, s2
+; RV32I-NEXT:    lbu s1, 26(a5)
 ; RV32I-NEXT:    lbu a5, 27(a5)
-; RV32I-NEXT:    or s0, s2, s0
-; RV32I-NEXT:    srli s2, t1, 1
-; RV32I-NEXT:    srl s2, s2, t3
+; RV32I-NEXT:    srli s2, t5, 1
+; RV32I-NEXT:    srl s2, s2, a7
+; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a5, a5, s1
+; RV32I-NEXT:    srli s1, t0, 1
+; RV32I-NEXT:    srl s1, s1, t2
 ; RV32I-NEXT:    or a5, a5, s0
 ; RV32I-NEXT:    srli s0, a5, 1
-; RV32I-NEXT:    srl t0, s0, t0
-; RV32I-NEXT:    srli s0, a4, 1
-; RV32I-NEXT:    srl t3, s0, t3
-; RV32I-NEXT:    sll t4, t4, a1
+; RV32I-NEXT:    srl a7, s0, a7
+; RV32I-NEXT:    srli s0, t4, 1
+; RV32I-NEXT:    srl t2, s0, t2
+; RV32I-NEXT:    sll t3, t3, a1
+; RV32I-NEXT:    sll t0, t0, a1
 ; RV32I-NEXT:    sll t1, t1, a1
-; RV32I-NEXT:    sll t2, t2, a1
-; RV32I-NEXT:    sll a4, a4, a1
+; RV32I-NEXT:    sll t4, t4, a1
 ; RV32I-NEXT:    sll t5, t5, a1
 ; RV32I-NEXT:    sll t6, t6, a1
 ; RV32I-NEXT:    sll a5, a5, a1
@@ -2683,62 +2683,62 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a6, 27(a2)
 ; RV32I-NEXT:    srli a6, a5, 16
 ; RV32I-NEXT:    sb a6, 26(a2)
-; RV32I-NEXT:    or a6, a5, t3
+; RV32I-NEXT:    or a6, a5, t2
 ; RV32I-NEXT:    srli a5, a5, 8
 ; RV32I-NEXT:    sb a5, 25(a2)
 ; RV32I-NEXT:    srli a5, t6, 24
 ; RV32I-NEXT:    sb a5, 31(a2)
 ; RV32I-NEXT:    srli a5, t6, 16
 ; RV32I-NEXT:    sb a5, 30(a2)
-; RV32I-NEXT:    or a5, t6, t0
-; RV32I-NEXT:    srli t0, t6, 8
-; RV32I-NEXT:    sb t0, 29(a2)
-; RV32I-NEXT:    srli t0, t5, 24
-; RV32I-NEXT:    sb t0, 19(a2)
-; RV32I-NEXT:    srli t0, t5, 16
-; RV32I-NEXT:    sb t0, 18(a2)
-; RV32I-NEXT:    or t0, t5, s2
-; RV32I-NEXT:    srli t3, t5, 8
-; RV32I-NEXT:    sb t3, 17(a2)
-; RV32I-NEXT:    srli t3, a4, 24
-; RV32I-NEXT:    sb t3, 23(a2)
-; RV32I-NEXT:    srli t3, a4, 16
-; RV32I-NEXT:    sb t3, 22(a2)
-; RV32I-NEXT:    or s1, a4, s1
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a4, 21(a2)
-; RV32I-NEXT:    srli a4, t2, 24
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a4, t2, 16
-; RV32I-NEXT:    sb a4, 10(a2)
-; RV32I-NEXT:    or a4, t2, a7
-; RV32I-NEXT:    srli a7, t2, 8
-; RV32I-NEXT:    sb a7, 9(a2)
-; RV32I-NEXT:    srli a7, t1, 24
-; RV32I-NEXT:    sb a7, 15(a2)
-; RV32I-NEXT:    srli a7, t1, 16
-; RV32I-NEXT:    sb a7, 14(a2)
-; RV32I-NEXT:    or a3, t1, a3
-; RV32I-NEXT:    srli a7, t1, 8
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    srli a7, a1, 24
-; RV32I-NEXT:    sb a7, 3(a2)
-; RV32I-NEXT:    srli a7, a1, 16
-; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    or a5, t6, a7
+; RV32I-NEXT:    srli a7, t6, 8
+; RV32I-NEXT:    sb a7, 29(a2)
+; RV32I-NEXT:    srli a7, t5, 24
+; RV32I-NEXT:    sb a7, 19(a2)
+; RV32I-NEXT:    srli a7, t5, 16
+; RV32I-NEXT:    sb a7, 18(a2)
+; RV32I-NEXT:    or a7, t5, s1
+; RV32I-NEXT:    srli t2, t5, 8
+; RV32I-NEXT:    sb t2, 17(a2)
+; RV32I-NEXT:    srli t2, t4, 24
+; RV32I-NEXT:    sb t2, 23(a2)
+; RV32I-NEXT:    srli t2, t4, 16
+; RV32I-NEXT:    sb t2, 22(a2)
+; RV32I-NEXT:    or t2, t4, s2
+; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    sb t4, 21(a2)
+; RV32I-NEXT:    srli t4, t1, 24
+; RV32I-NEXT:    sb t4, 11(a2)
+; RV32I-NEXT:    srli t4, t1, 16
+; RV32I-NEXT:    sb t4, 10(a2)
+; RV32I-NEXT:    or a4, t1, a4
+; RV32I-NEXT:    srli t1, t1, 8
+; RV32I-NEXT:    sb t1, 9(a2)
+; RV32I-NEXT:    srli t1, t0, 24
+; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    srli t1, t0, 16
+; RV32I-NEXT:    sb t1, 14(a2)
+; RV32I-NEXT:    or a3, t0, a3
+; RV32I-NEXT:    srli t0, t0, 8
+; RV32I-NEXT:    sb t0, 13(a2)
+; RV32I-NEXT:    srli t0, a1, 24
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    srli t0, a1, 16
+; RV32I-NEXT:    sb t0, 2(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    srli a1, a1, 8
 ; RV32I-NEXT:    sb a1, 1(a2)
-; RV32I-NEXT:    srli a1, t4, 24
+; RV32I-NEXT:    srli a1, t3, 24
 ; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    srli a1, t4, 16
+; RV32I-NEXT:    srli a1, t3, 16
 ; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    or a0, t4, a0
-; RV32I-NEXT:    srli a1, t4, 8
+; RV32I-NEXT:    or a0, t3, a0
+; RV32I-NEXT:    srli a1, t3, 8
 ; RV32I-NEXT:    sb a1, 5(a2)
 ; RV32I-NEXT:    sb a6, 24(a2)
 ; RV32I-NEXT:    sb a5, 28(a2)
-; RV32I-NEXT:    sb t0, 16(a2)
-; RV32I-NEXT:    sb s1, 20(a2)
+; RV32I-NEXT:    sb a7, 16(a2)
+; RV32I-NEXT:    sb t2, 20(a2)
 ; RV32I-NEXT:    sb a4, 8(a2)
 ; RV32I-NEXT:    sb a3, 12(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
@@ -2806,33 +2806,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s5, 16(a0)
 ; RV64I-NEXT:    lbu s6, 17(a0)
 ; RV64I-NEXT:    lbu s7, 18(a0)
-; RV64I-NEXT:    lbu s8, 1(a1)
-; RV64I-NEXT:    lbu s9, 0(a1)
-; RV64I-NEXT:    lbu s10, 2(a1)
-; RV64I-NEXT:    lbu s11, 19(a0)
-; RV64I-NEXT:    slli s8, s8, 8
-; RV64I-NEXT:    or s8, s8, s9
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    lbu s9, 5(a1)
-; RV64I-NEXT:    lbu ra, 4(a1)
-; RV64I-NEXT:    or s8, s10, s8
-; RV64I-NEXT:    lbu s10, 6(a1)
+; RV64I-NEXT:    lbu s8, 19(a0)
+; RV64I-NEXT:    lbu s9, 1(a1)
+; RV64I-NEXT:    lbu s10, 0(a1)
+; RV64I-NEXT:    lbu s11, 2(a1)
+; RV64I-NEXT:    lbu ra, 3(a1)
 ; RV64I-NEXT:    slli s9, s9, 8
-; RV64I-NEXT:    or s9, s9, ra
-; RV64I-NEXT:    lbu ra, 7(a1)
-; RV64I-NEXT:    slli s10, s10, 16
-; RV64I-NEXT:    or s9, s10, s9
-; RV64I-NEXT:    lbu s10, 20(a0)
+; RV64I-NEXT:    or s9, s9, s10
+; RV64I-NEXT:    slli s11, s11, 16
 ; RV64I-NEXT:    slli ra, ra, 24
-; RV64I-NEXT:    or s9, ra, s9
-; RV64I-NEXT:    lbu ra, 21(a0)
-; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli s9, s9, 32
-; RV64I-NEXT:    or s8, s9, s8
-; RV64I-NEXT:    lbu s9, 22(a0)
+; RV64I-NEXT:    lbu s10, 5(a1)
+; RV64I-NEXT:    or s11, ra, s11
+; RV64I-NEXT:    or s9, s11, s9
+; RV64I-NEXT:    lbu s11, 4(a1)
+; RV64I-NEXT:    slli s10, s10, 8
+; RV64I-NEXT:    lbu ra, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    or s10, s10, s11
+; RV64I-NEXT:    lbu s11, 20(a0)
+; RV64I-NEXT:    slli ra, ra, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or t1, s8, a1
-; RV64I-NEXT:    lbu s8, 23(a0)
+; RV64I-NEXT:    or a1, a1, ra
+; RV64I-NEXT:    lbu ra, 21(a0)
+; RV64I-NEXT:    or a1, a1, s10
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or t1, a1, s9
+; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    lbu a7, 24(a0)
 ; RV64I-NEXT:    lbu a6, 25(a0)
 ; RV64I-NEXT:    lbu a5, 26(a0)
@@ -2847,11 +2847,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a5, 82(sp)
 ; RV64I-NEXT:    sb a6, 81(sp)
 ; RV64I-NEXT:    sb a7, 80(sp)
-; RV64I-NEXT:    sb s8, 79(sp)
-; RV64I-NEXT:    sb s9, 78(sp)
+; RV64I-NEXT:    sb s9, 79(sp)
+; RV64I-NEXT:    sb s10, 78(sp)
 ; RV64I-NEXT:    sb ra, 77(sp)
-; RV64I-NEXT:    sb s10, 76(sp)
-; RV64I-NEXT:    sb s11, 75(sp)
+; RV64I-NEXT:    sb s11, 76(sp)
+; RV64I-NEXT:    sb s8, 75(sp)
 ; RV64I-NEXT:    sb s7, 74(sp)
 ; RV64I-NEXT:    sb s6, 73(sp)
 ; RV64I-NEXT:    sb s5, 72(sp)
@@ -2862,9 +2862,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb s0, 67(sp)
 ; RV64I-NEXT:    sb t6, 66(sp)
 ; RV64I-NEXT:    sb t5, 65(sp)
+; RV64I-NEXT:    sb t4, 64(sp)
 ; RV64I-NEXT:    sb t0, 87(sp)
 ; RV64I-NEXT:    slli t0, t0, 56
-; RV64I-NEXT:    sb t4, 64(sp)
 ; RV64I-NEXT:    sb t3, 63(sp)
 ; RV64I-NEXT:    sb t2, 62(sp)
 ; RV64I-NEXT:    ld a0, 8(sp) # 8-byte Folded Reload
@@ -2931,20 +2931,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    slli a4, a4, 16
 ; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    or a0, a4, a0
 ; RV64I-NEXT:    lbu a1, 13(a3)
 ; RV64I-NEXT:    lbu a4, 12(a3)
-; RV64I-NEXT:    lbu a6, 14(a3)
-; RV64I-NEXT:    lbu a7, 15(a3)
+; RV64I-NEXT:    lbu a5, 14(a3)
+; RV64I-NEXT:    lbu a6, 15(a3)
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a1, a6, a1
-; RV64I-NEXT:    or a1, a7, a1
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a4, a0, a5
+; RV64I-NEXT:    or a4, a1, a0
 ; RV64I-NEXT:    andi a1, t1, 7
 ; RV64I-NEXT:    lbu a0, 17(a3)
 ; RV64I-NEXT:    lbu a5, 16(a3)
@@ -2954,20 +2954,20 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    slli a6, a6, 16
 ; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a0, a6, a0
+; RV64I-NEXT:    or a5, a7, a6
+; RV64I-NEXT:    or a0, a5, a0
 ; RV64I-NEXT:    lbu a5, 21(a3)
 ; RV64I-NEXT:    lbu a6, 20(a3)
-; RV64I-NEXT:    lbu t0, 22(a3)
-; RV64I-NEXT:    lbu t1, 23(a3)
+; RV64I-NEXT:    lbu a7, 22(a3)
+; RV64I-NEXT:    lbu t0, 23(a3)
 ; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, t1, a5
+; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, t0, a7
+; RV64I-NEXT:    or a5, a6, a5
 ; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    or a0, a5, a0
-; RV64I-NEXT:    or a5, a0, a7
+; RV64I-NEXT:    or a5, a5, a0
 ; RV64I-NEXT:    slli a0, a5, 1
 ; RV64I-NEXT:    not a6, a1
 ; RV64I-NEXT:    sll a0, a0, a6
@@ -2979,45 +2979,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    or a6, a6, a7
 ; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 5(a3)
 ; RV64I-NEXT:    lbu t0, 4(a3)
-; RV64I-NEXT:    lbu t2, 6(a3)
-; RV64I-NEXT:    lbu t3, 7(a3)
+; RV64I-NEXT:    lbu t1, 6(a3)
+; RV64I-NEXT:    lbu t2, 7(a3)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or a7, t2, a7
-; RV64I-NEXT:    or a7, t3, a7
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    slli a7, a7, 32
 ; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a7, 25(a3)
 ; RV64I-NEXT:    lbu t0, 24(a3)
-; RV64I-NEXT:    lbu t2, 26(a3)
-; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 26(a3)
+; RV64I-NEXT:    lbu t2, 27(a3)
 ; RV64I-NEXT:    slli a7, a7, 8
 ; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or t0, t2, t1
+; RV64I-NEXT:    or a7, t0, a7
 ; RV64I-NEXT:    lbu t0, 29(a3)
-; RV64I-NEXT:    or a7, t2, a7
 ; RV64I-NEXT:    lbu t1, 28(a3)
 ; RV64I-NEXT:    lbu t2, 30(a3)
+; RV64I-NEXT:    lbu a3, 31(a3)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    lbu t3, 31(a3)
 ; RV64I-NEXT:    or t0, t0, t1
 ; RV64I-NEXT:    slli t2, t2, 16
-; RV64I-NEXT:    or t0, t2, t0
-; RV64I-NEXT:    slli t3, t3, 24
-; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, t2
 ; RV64I-NEXT:    slli t1, a4, 1
-; RV64I-NEXT:    lbu a3, 27(a3)
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    or a3, a3, t0
 ; RV64I-NEXT:    xori t0, a1, 63
 ; RV64I-NEXT:    sll t1, t1, t0
-; RV64I-NEXT:    slli a3, a3, 24
-; RV64I-NEXT:    or a3, a7, a3
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    or a3, a3, a7
 ; RV64I-NEXT:    slli a7, a3, 1
 ; RV64I-NEXT:    sll a7, a7, t0
 ; RV64I-NEXT:    srl a4, a4, a1
@@ -3147,17 +3147,17 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu s8, 18(a0)
 ; RV32I-NEXT:    lbu a4, 1(a1)
 ; RV32I-NEXT:    lbu s9, 19(a0)
-; RV32I-NEXT:    lbu s10, 0(a1)
-; RV32I-NEXT:    lbu s11, 20(a0)
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 0(a1)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    lbu ra, 2(a1)
-; RV32I-NEXT:    or a4, a4, s10
-; RV32I-NEXT:    lbu s10, 21(a0)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    or a4, a4, s11
+; RV32I-NEXT:    lbu s11, 21(a0)
 ; RV32I-NEXT:    slli ra, ra, 16
-; RV32I-NEXT:    or a4, ra, a4
-; RV32I-NEXT:    lbu ra, 22(a0)
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, ra
+; RV32I-NEXT:    lbu ra, 22(a0)
 ; RV32I-NEXT:    or t1, a1, a4
 ; RV32I-NEXT:    lbu t0, 23(a0)
 ; RV32I-NEXT:    lbu a7, 24(a0)
@@ -3176,8 +3176,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sb a7, 52(sp)
 ; RV32I-NEXT:    sb t0, 51(sp)
 ; RV32I-NEXT:    sb ra, 50(sp)
-; RV32I-NEXT:    sb s10, 49(sp)
-; RV32I-NEXT:    sb s11, 48(sp)
+; RV32I-NEXT:    sb s11, 49(sp)
+; RV32I-NEXT:    sb s10, 48(sp)
 ; RV32I-NEXT:    sb s9, 47(sp)
 ; RV32I-NEXT:    sb s8, 46(sp)
 ; RV32I-NEXT:    sb s7, 45(sp)
@@ -3254,8 +3254,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a5, a5, 24
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    or t4, a5, a0
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    or t4, a4, a0
 ; RV32I-NEXT:    andi a4, t1, 7
 ; RV32I-NEXT:    lbu a0, 9(a3)
 ; RV32I-NEXT:    lbu a1, 8(a3)
@@ -3265,8 +3265,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a0, a5, a0
-; RV32I-NEXT:    or a6, a6, a0
+; RV32I-NEXT:    or a1, a6, a5
+; RV32I-NEXT:    or a6, a1, a0
 ; RV32I-NEXT:    slli a0, a6, 1
 ; RV32I-NEXT:    not t0, a4
 ; RV32I-NEXT:    sll a0, a0, t0
@@ -3278,8 +3278,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a1, a1, a5
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or t1, t1, a1
+; RV32I-NEXT:    or a5, t1, a7
+; RV32I-NEXT:    or t1, a5, a1
 ; RV32I-NEXT:    slli a1, t4, 1
 ; RV32I-NEXT:    xori t2, a4, 31
 ; RV32I-NEXT:    sll a1, a1, t2
@@ -3291,8 +3291,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a5, a5, a7
 ; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t5, t5, 24
-; RV32I-NEXT:    or a5, t3, a5
-; RV32I-NEXT:    or t3, t5, a5
+; RV32I-NEXT:    or a7, t5, t3
+; RV32I-NEXT:    or t3, a7, a5
 ; RV32I-NEXT:    lbu a5, 17(a3)
 ; RV32I-NEXT:    lbu a7, 16(a3)
 ; RV32I-NEXT:    lbu t5, 18(a3)
@@ -3301,8 +3301,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or a5, a5, a7
 ; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    or a5, t5, a5
-; RV32I-NEXT:    or a5, t6, a5
+; RV32I-NEXT:    or a7, t6, t5
+; RV32I-NEXT:    or a5, a7, a5
 ; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    sll a7, a7, t0
 ; RV32I-NEXT:    lbu t5, 21(a3)
@@ -3313,8 +3313,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t5, t5, t6
 ; RV32I-NEXT:    slli s0, s0, 16
 ; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    or s0, s1, s0
 ; RV32I-NEXT:    or t5, s0, t5
-; RV32I-NEXT:    or t5, s1, t5
 ; RV32I-NEXT:    lbu t6, 25(a3)
 ; RV32I-NEXT:    lbu s0, 24(a3)
 ; RV32I-NEXT:    lbu s1, 26(a3)
@@ -3323,23 +3323,23 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    or t6, t6, s0
 ; RV32I-NEXT:    slli s1, s1, 16
 ; RV32I-NEXT:    slli s2, s2, 24
-; RV32I-NEXT:    or t6, s1, t6
-; RV32I-NEXT:    or t6, s2, t6
+; RV32I-NEXT:    or s0, s2, s1
+; RV32I-NEXT:    or t6, s0, t6
 ; RV32I-NEXT:    lbu s0, 29(a3)
-; RV32I-NEXT:    slli s1, t6, 1
-; RV32I-NEXT:    lbu s2, 28(a3)
-; RV32I-NEXT:    sll t0, s1, t0
+; RV32I-NEXT:    lbu s1, 28(a3)
+; RV32I-NEXT:    slli s2, t6, 1
+; RV32I-NEXT:    sll t0, s2, t0
 ; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    or s0, s0, s1
 ; RV32I-NEXT:    lbu s1, 30(a3)
-; RV32I-NEXT:    or s0, s0, s2
+; RV32I-NEXT:    lbu a3, 31(a3)
 ; RV32I-NEXT:    slli s2, t3, 1
 ; RV32I-NEXT:    sll s2, s2, t2
 ; RV32I-NEXT:    slli s1, s1, 16
-; RV32I-NEXT:    lbu a3, 31(a3)
-; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a3, a3, s1
 ; RV32I-NEXT:    slli s1, t5, 1
 ; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    slli a3, a3, 24
 ; RV32I-NEXT:    or a3, a3, s0
 ; RV32I-NEXT:    slli s0, a3, 1
 ; RV32I-NEXT:    sll t2, s0, t2

diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 362229bfbf9dc..6c474794a29c8 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -187,10 +187,10 @@ entry:
 define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-LABEL: saddo1.i64:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    add a5, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    sltu a0, a2, a0
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a5, a1, a0
+; RV32-NEXT:    add a5, a5, a0
 ; RV32-NEXT:    xor a0, a1, a5
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    not a1, a1
@@ -211,10 +211,10 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV32ZBA-LABEL: saddo1.i64:
 ; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA-NEXT:    add a5, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
-; RV32ZBA-NEXT:    add a0, a3, a0
-; RV32ZBA-NEXT:    add a5, a1, a0
+; RV32ZBA-NEXT:    add a5, a5, a0
 ; RV32ZBA-NEXT:    xor a0, a1, a5
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    not a1, a1
@@ -449,10 +449,10 @@ entry:
 define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-LABEL: uaddo.i64:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    add a3, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    sltu a0, a2, a0
 ; RV32-NEXT:    add a3, a3, a0
-; RV32-NEXT:    add a3, a1, a3
 ; RV32-NEXT:    beq a3, a1, .LBB10_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a0, a3, a1
@@ -470,10 +470,10 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
 ;
 ; RV32ZBA-LABEL: uaddo.i64:
 ; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA-NEXT:    add a3, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
 ; RV32ZBA-NEXT:    add a3, a3, a0
-; RV32ZBA-NEXT:    add a3, a1, a3
 ; RV32ZBA-NEXT:    beq a3, a1, .LBB10_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a3, a1
@@ -634,8 +634,8 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-LABEL: ssubo.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a5, a0, a2
-; RV32-NEXT:    add a5, a3, a5
-; RV32-NEXT:    sub a5, a1, a5
+; RV32-NEXT:    sub a6, a1, a3
+; RV32-NEXT:    sub a5, a6, a5
 ; RV32-NEXT:    xor a6, a1, a5
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    and a1, a1, a6
@@ -658,8 +658,8 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-LABEL: ssubo.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a5, a0, a2
-; RV32ZBA-NEXT:    add a5, a3, a5
-; RV32ZBA-NEXT:    sub a5, a1, a5
+; RV32ZBA-NEXT:    sub a6, a1, a3
+; RV32ZBA-NEXT:    sub a5, a6, a5
 ; RV32ZBA-NEXT:    xor a6, a1, a5
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    and a1, a1, a6
@@ -806,8 +806,8 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-LABEL: usubo.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a5, a0, a2
-; RV32-NEXT:    add a3, a3, a5
 ; RV32-NEXT:    sub a3, a1, a3
+; RV32-NEXT:    sub a3, a3, a5
 ; RV32-NEXT:    sub a2, a0, a2
 ; RV32-NEXT:    beq a3, a1, .LBB18_2
 ; RV32-NEXT:  # %bb.1: # %entry
@@ -830,8 +830,8 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-LABEL: usubo.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a5, a0, a2
-; RV32ZBA-NEXT:    add a3, a3, a5
 ; RV32ZBA-NEXT:    sub a3, a1, a3
+; RV32ZBA-NEXT:    sub a3, a3, a5
 ; RV32ZBA-NEXT:    sub a2, a0, a2
 ; RV32ZBA-NEXT:    beq a3, a1, .LBB18_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
@@ -987,21 +987,21 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    sltu t0, t1, t0
 ; RV32-NEXT:    sltu a6, a7, a6
 ; RV32-NEXT:    mulhu a7, a1, a3
+; RV32-NEXT:    add a6, a7, a6
 ; RV32-NEXT:    add a6, a6, t0
-; RV32-NEXT:    mulhu t0, a2, t2
+; RV32-NEXT:    mulhu a7, a2, t2
+; RV32-NEXT:    add a7, a7, t3
 ; RV32-NEXT:    mul a3, a3, t2
-; RV32-NEXT:    add a3, t3, a3
-; RV32-NEXT:    add a3, t0, a3
+; RV32-NEXT:    add a3, a7, a3
 ; RV32-NEXT:    mul a1, t4, a1
-; RV32-NEXT:    mulhu t0, t4, a0
+; RV32-NEXT:    mulhu a7, t4, a0
+; RV32-NEXT:    add a1, a7, a1
 ; RV32-NEXT:    add a1, a1, t5
 ; RV32-NEXT:    add a1, a1, a3
 ; RV32-NEXT:    sltu a3, t6, t5
 ; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    add a1, t0, a1
 ; RV32-NEXT:    add a1, a6, a1
 ; RV32-NEXT:    add a1, a1, s1
-; RV32-NEXT:    add a1, a7, a1
 ; RV32-NEXT:    srai a3, a5, 31
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    xor a3, s0, a3
@@ -1058,21 +1058,21 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    sltu t0, t1, t0
 ; RV32ZBA-NEXT:    sltu a6, a7, a6
 ; RV32ZBA-NEXT:    mulhu a7, a1, a3
+; RV32ZBA-NEXT:    add a6, a7, a6
 ; RV32ZBA-NEXT:    add a6, a6, t0
-; RV32ZBA-NEXT:    mulhu t0, a2, t2
+; RV32ZBA-NEXT:    mulhu a7, a2, t2
+; RV32ZBA-NEXT:    add a7, a7, t3
 ; RV32ZBA-NEXT:    mul a3, a3, t2
-; RV32ZBA-NEXT:    add a3, t3, a3
-; RV32ZBA-NEXT:    add a3, t0, a3
+; RV32ZBA-NEXT:    add a3, a7, a3
 ; RV32ZBA-NEXT:    mul a1, t4, a1
-; RV32ZBA-NEXT:    mulhu t0, t4, a0
+; RV32ZBA-NEXT:    mulhu a7, t4, a0
+; RV32ZBA-NEXT:    add a1, a7, a1
 ; RV32ZBA-NEXT:    add a1, a1, t5
 ; RV32ZBA-NEXT:    add a1, a1, a3
 ; RV32ZBA-NEXT:    sltu a3, t6, t5
 ; RV32ZBA-NEXT:    add a1, a1, a3
-; RV32ZBA-NEXT:    add a1, t0, a1
 ; RV32ZBA-NEXT:    add a1, a6, a1
 ; RV32ZBA-NEXT:    add a1, a1, s1
-; RV32ZBA-NEXT:    add a1, a7, a1
 ; RV32ZBA-NEXT:    srai a3, a5, 31
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    xor a3, s0, a3
@@ -1335,20 +1335,20 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a5, a3, a0
 ; RV32-NEXT:    mul a6, a1, a2
-; RV32-NEXT:    mulhu a7, a0, a2
-; RV32-NEXT:    add a5, a7, a5
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sltu a6, a5, a7
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    mulhu a6, a0, a2
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    sltu a6, a5, a6
 ; RV32-NEXT:    snez a7, a3
 ; RV32-NEXT:    snez t0, a1
 ; RV32-NEXT:    and a7, t0, a7
 ; RV32-NEXT:    mulhu a1, a1, a2
 ; RV32-NEXT:    snez a1, a1
+; RV32-NEXT:    or a1, a7, a1
 ; RV32-NEXT:    mulhu a3, a3, a0
 ; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    or a1, a1, a3
 ; RV32-NEXT:    or a1, a1, a6
-; RV32-NEXT:    or a1, a7, a1
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    sw a0, 0(a4)
 ; RV32-NEXT:    sw a5, 4(a4)
@@ -1368,20 +1368,20 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a5, a3, a0
 ; RV32ZBA-NEXT:    mul a6, a1, a2
-; RV32ZBA-NEXT:    mulhu a7, a0, a2
-; RV32ZBA-NEXT:    add a5, a7, a5
-; RV32ZBA-NEXT:    add a5, a5, a6
-; RV32ZBA-NEXT:    sltu a6, a5, a7
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    mulhu a6, a0, a2
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    sltu a6, a5, a6
 ; RV32ZBA-NEXT:    snez a7, a3
 ; RV32ZBA-NEXT:    snez t0, a1
 ; RV32ZBA-NEXT:    and a7, t0, a7
 ; RV32ZBA-NEXT:    mulhu a1, a1, a2
 ; RV32ZBA-NEXT:    snez a1, a1
+; RV32ZBA-NEXT:    or a1, a7, a1
 ; RV32ZBA-NEXT:    mulhu a3, a3, a0
 ; RV32ZBA-NEXT:    snez a3, a3
 ; RV32ZBA-NEXT:    or a1, a1, a3
 ; RV32ZBA-NEXT:    or a1, a1, a6
-; RV32ZBA-NEXT:    or a1, a7, a1
 ; RV32ZBA-NEXT:    mul a0, a0, a2
 ; RV32ZBA-NEXT:    sw a0, 0(a4)
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
@@ -1561,10 +1561,10 @@ entry:
 define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: saddo.select.i64:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    add a4, a0, a2
-; RV32-NEXT:    sltu a4, a4, a0
-; RV32-NEXT:    add a4, a3, a4
-; RV32-NEXT:    add a4, a1, a4
+; RV32-NEXT:    add a4, a1, a3
+; RV32-NEXT:    add a5, a0, a2
+; RV32-NEXT:    sltu a5, a5, a0
+; RV32-NEXT:    add a4, a4, a5
 ; RV32-NEXT:    xor a4, a1, a4
 ; RV32-NEXT:    xor a5, a1, a3
 ; RV32-NEXT:    not a5, a5
@@ -1589,10 +1589,10 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: saddo.select.i64:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    add a4, a0, a2
-; RV32ZBA-NEXT:    sltu a4, a4, a0
-; RV32ZBA-NEXT:    add a4, a3, a4
-; RV32ZBA-NEXT:    add a4, a1, a4
+; RV32ZBA-NEXT:    add a4, a1, a3
+; RV32ZBA-NEXT:    add a5, a0, a2
+; RV32ZBA-NEXT:    sltu a5, a5, a0
+; RV32ZBA-NEXT:    add a4, a4, a5
 ; RV32ZBA-NEXT:    xor a4, a1, a4
 ; RV32ZBA-NEXT:    xor a5, a1, a3
 ; RV32ZBA-NEXT:    not a5, a5
@@ -1624,10 +1624,10 @@ entry:
 define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: saddo.not.i64:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    add a4, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    sltu a0, a2, a0
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add a0, a4, a0
 ; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    not a1, a1
@@ -1647,10 +1647,10 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: saddo.not.i64:
 ; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA-NEXT:    add a4, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
-; RV32ZBA-NEXT:    add a0, a3, a0
-; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a4, a0
 ; RV32ZBA-NEXT:    xor a0, a1, a0
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    not a1, a1
@@ -1755,10 +1755,10 @@ entry:
 define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: uaddo.select.i64:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    add a5, a1, a3
 ; RV32-NEXT:    add a4, a0, a2
 ; RV32-NEXT:    sltu a4, a4, a0
-; RV32-NEXT:    add a5, a3, a4
-; RV32-NEXT:    add a5, a1, a5
+; RV32-NEXT:    add a5, a5, a4
 ; RV32-NEXT:    bne a5, a1, .LBB34_3
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    beqz a4, .LBB34_4
@@ -1783,10 +1783,10 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: uaddo.select.i64:
 ; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA-NEXT:    add a5, a1, a3
 ; RV32ZBA-NEXT:    add a4, a0, a2
 ; RV32ZBA-NEXT:    sltu a4, a4, a0
-; RV32ZBA-NEXT:    add a5, a3, a4
-; RV32ZBA-NEXT:    add a5, a1, a5
+; RV32ZBA-NEXT:    add a5, a5, a4
 ; RV32ZBA-NEXT:    bne a5, a1, .LBB34_3
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    beqz a4, .LBB34_4
@@ -1818,10 +1818,10 @@ entry:
 define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: uaddo.not.i64:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    add a3, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    sltu a0, a2, a0
 ; RV32-NEXT:    add a2, a3, a0
-; RV32-NEXT:    add a2, a1, a2
 ; RV32-NEXT:    beq a2, a1, .LBB35_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a0, a2, a1
@@ -1838,10 +1838,10 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: uaddo.not.i64:
 ; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA-NEXT:    add a3, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
 ; RV32ZBA-NEXT:    add a2, a3, a0
-; RV32ZBA-NEXT:    add a2, a1, a2
 ; RV32ZBA-NEXT:    beq a2, a1, .LBB35_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a2, a1
@@ -1956,11 +1956,11 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: ssubo.select.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a4, a0, a2
-; RV32-NEXT:    add a4, a3, a4
-; RV32-NEXT:    sub a4, a1, a4
-; RV32-NEXT:    xor a4, a1, a4
-; RV32-NEXT:    xor a5, a1, a3
-; RV32-NEXT:    and a4, a5, a4
+; RV32-NEXT:    sub a5, a1, a3
+; RV32-NEXT:    sub a5, a5, a4
+; RV32-NEXT:    xor a5, a1, a5
+; RV32-NEXT:    xor a4, a1, a3
+; RV32-NEXT:    and a4, a4, a5
 ; RV32-NEXT:    bltz a4, .LBB38_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    mv a0, a2
@@ -1982,11 +1982,11 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-LABEL: ssubo.select.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a4, a0, a2
-; RV32ZBA-NEXT:    add a4, a3, a4
-; RV32ZBA-NEXT:    sub a4, a1, a4
-; RV32ZBA-NEXT:    xor a4, a1, a4
-; RV32ZBA-NEXT:    xor a5, a1, a3
-; RV32ZBA-NEXT:    and a4, a5, a4
+; RV32ZBA-NEXT:    sub a5, a1, a3
+; RV32ZBA-NEXT:    sub a5, a5, a4
+; RV32ZBA-NEXT:    xor a5, a1, a5
+; RV32ZBA-NEXT:    xor a4, a1, a3
+; RV32ZBA-NEXT:    and a4, a4, a5
 ; RV32ZBA-NEXT:    bltz a4, .LBB38_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    mv a0, a2
@@ -2015,12 +2015,12 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: ssub.not.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    sub a0, a1, a0
-; RV32-NEXT:    xor a0, a1, a0
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sub a2, a2, a0
+; RV32-NEXT:    xor a2, a1, a2
 ; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    slti a0, a1, 0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
@@ -2036,12 +2036,12 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-LABEL: ssub.not.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a0, a2
-; RV32ZBA-NEXT:    add a0, a3, a0
-; RV32ZBA-NEXT:    sub a0, a1, a0
-; RV32ZBA-NEXT:    xor a0, a1, a0
+; RV32ZBA-NEXT:    sub a2, a1, a3
+; RV32ZBA-NEXT:    sub a2, a2, a0
+; RV32ZBA-NEXT:    xor a2, a1, a2
 ; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    and a0, a1, a0
-; RV32ZBA-NEXT:    slti a0, a0, 0
+; RV32ZBA-NEXT:    and a1, a1, a2
+; RV32ZBA-NEXT:    slti a0, a1, 0
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2142,8 +2142,8 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: usubo.select.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a4, a0, a2
-; RV32-NEXT:    add a4, a3, a4
-; RV32-NEXT:    sub a4, a1, a4
+; RV32-NEXT:    sub a5, a1, a3
+; RV32-NEXT:    sub a4, a5, a4
 ; RV32-NEXT:    beq a4, a1, .LBB42_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a4, a1, a4
@@ -2171,8 +2171,8 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-LABEL: usubo.select.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a4, a0, a2
-; RV32ZBA-NEXT:    add a4, a3, a4
-; RV32ZBA-NEXT:    sub a4, a1, a4
+; RV32ZBA-NEXT:    sub a5, a1, a3
+; RV32ZBA-NEXT:    sub a4, a5, a4
 ; RV32ZBA-NEXT:    beq a4, a1, .LBB42_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    sltu a4, a1, a4
@@ -2207,8 +2207,8 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: usubo.not.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a4, a0, a2
-; RV32-NEXT:    add a3, a3, a4
 ; RV32-NEXT:    sub a3, a1, a3
+; RV32-NEXT:    sub a3, a3, a4
 ; RV32-NEXT:    beq a3, a1, .LBB43_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a0, a1, a3
@@ -2230,8 +2230,8 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-LABEL: usubo.not.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a4, a0, a2
-; RV32ZBA-NEXT:    add a3, a3, a4
 ; RV32ZBA-NEXT:    sub a3, a1, a3
+; RV32ZBA-NEXT:    sub a3, a3, a4
 ; RV32ZBA-NEXT:    beq a3, a1, .LBB43_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a1, a3
@@ -2377,21 +2377,21 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
+; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
-; RV32-NEXT:    mulhu a7, a2, t1
-; RV32-NEXT:    mul t0, a3, t1
-; RV32-NEXT:    add t0, t2, t0
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    mul t0, t3, a1
-; RV32-NEXT:    mulhu t1, t3, a0
-; RV32-NEXT:    add t0, t0, t4
+; RV32-NEXT:    mulhu a6, a2, t1
+; RV32-NEXT:    add a6, a6, t2
+; RV32-NEXT:    mul a7, a3, t1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    mul a7, t3, a1
+; RV32-NEXT:    mulhu t0, t3, a0
 ; RV32-NEXT:    add a7, t0, a7
-; RV32-NEXT:    sltu t0, t5, t4
-; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, t1, a7
-; RV32-NEXT:    add a5, a5, a7
+; RV32-NEXT:    add a7, a7, t4
+; RV32-NEXT:    add a6, a7, a6
+; RV32-NEXT:    sltu a7, t5, t4
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add a5, a5, a6
 ; RV32-NEXT:    add a5, a5, s0
-; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    srai a4, a4, 31
 ; RV32-NEXT:    xor a5, a5, a4
 ; RV32-NEXT:    xor a4, t6, a4
@@ -2446,21 +2446,21 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
+; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
-; RV32ZBA-NEXT:    mulhu a7, a2, t1
-; RV32ZBA-NEXT:    mul t0, a3, t1
-; RV32ZBA-NEXT:    add t0, t2, t0
-; RV32ZBA-NEXT:    add a7, a7, t0
-; RV32ZBA-NEXT:    mul t0, t3, a1
-; RV32ZBA-NEXT:    mulhu t1, t3, a0
-; RV32ZBA-NEXT:    add t0, t0, t4
+; RV32ZBA-NEXT:    mulhu a6, a2, t1
+; RV32ZBA-NEXT:    add a6, a6, t2
+; RV32ZBA-NEXT:    mul a7, a3, t1
+; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    mul a7, t3, a1
+; RV32ZBA-NEXT:    mulhu t0, t3, a0
 ; RV32ZBA-NEXT:    add a7, t0, a7
-; RV32ZBA-NEXT:    sltu t0, t5, t4
-; RV32ZBA-NEXT:    add a7, a7, t0
-; RV32ZBA-NEXT:    add a7, t1, a7
-; RV32ZBA-NEXT:    add a5, a5, a7
+; RV32ZBA-NEXT:    add a7, a7, t4
+; RV32ZBA-NEXT:    add a6, a7, a6
+; RV32ZBA-NEXT:    sltu a7, t5, t4
+; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    add a5, a5, a6
 ; RV32ZBA-NEXT:    add a5, a5, s0
-; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    srai a4, a4, 31
 ; RV32ZBA-NEXT:    xor a5, a5, a4
 ; RV32ZBA-NEXT:    xor a4, t6, a4
@@ -2522,21 +2522,21 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
+; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a2, a2, t1
+; RV32-NEXT:    add a2, a2, t2
 ; RV32-NEXT:    mul a3, a3, t1
-; RV32-NEXT:    add a3, t2, a3
 ; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    mul a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
-; RV32-NEXT:    add a1, a1, t4
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    sltu a2, t5, t4
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a0, t4
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    sltu a1, t5, t4
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add a0, a5, a0
 ; RV32-NEXT:    add a0, a0, s0
-; RV32-NEXT:    add a0, a6, a0
 ; RV32-NEXT:    srai a4, a4, 31
 ; RV32-NEXT:    xor a0, a0, a4
 ; RV32-NEXT:    xor a1, t6, a4
@@ -2585,21 +2585,21 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
+; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a2, a2, t1
+; RV32ZBA-NEXT:    add a2, a2, t2
 ; RV32ZBA-NEXT:    mul a3, a3, t1
-; RV32ZBA-NEXT:    add a3, t2, a3
 ; RV32ZBA-NEXT:    add a2, a2, a3
 ; RV32ZBA-NEXT:    mul a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
-; RV32ZBA-NEXT:    add a1, a1, t4
-; RV32ZBA-NEXT:    add a1, a1, a2
-; RV32ZBA-NEXT:    sltu a2, t5, t4
-; RV32ZBA-NEXT:    add a1, a1, a2
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    add a0, a0, t4
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:    sltu a1, t5, t4
 ; RV32ZBA-NEXT:    add a0, a0, a1
 ; RV32ZBA-NEXT:    add a0, a5, a0
 ; RV32ZBA-NEXT:    add a0, a0, s0
-; RV32ZBA-NEXT:    add a0, a6, a0
 ; RV32ZBA-NEXT:    srai a4, a4, 31
 ; RV32ZBA-NEXT:    xor a0, a0, a4
 ; RV32ZBA-NEXT:    xor a1, t6, a4
@@ -2715,19 +2715,19 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
-; RV32-NEXT:    mulhu a6, a0, a2
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    sltu a4, a4, a6
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    mulhu a5, a0, a2
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    sltu a4, a4, a5
 ; RV32-NEXT:    snez a5, a3
 ; RV32-NEXT:    snez a6, a1
 ; RV32-NEXT:    and a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a2
 ; RV32-NEXT:    snez a6, a6
-; RV32-NEXT:    mulhu a7, a3, a0
-; RV32-NEXT:    snez a7, a7
-; RV32-NEXT:    or a6, a6, a7
-; RV32-NEXT:    or a4, a6, a4
+; RV32-NEXT:    or a5, a5, a6
+; RV32-NEXT:    mulhu a6, a3, a0
+; RV32-NEXT:    snez a6, a6
+; RV32-NEXT:    or a5, a5, a6
 ; RV32-NEXT:    or a4, a5, a4
 ; RV32-NEXT:    bnez a4, .LBB50_2
 ; RV32-NEXT:  # %bb.1: # %entry
@@ -2749,19 +2749,19 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    mulhu a6, a0, a2
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    add a4, a4, a5
-; RV32ZBA-NEXT:    sltu a4, a4, a6
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    mulhu a5, a0, a2
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    sltu a4, a4, a5
 ; RV32ZBA-NEXT:    snez a5, a3
 ; RV32ZBA-NEXT:    snez a6, a1
 ; RV32ZBA-NEXT:    and a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a2
 ; RV32ZBA-NEXT:    snez a6, a6
-; RV32ZBA-NEXT:    mulhu a7, a3, a0
-; RV32ZBA-NEXT:    snez a7, a7
-; RV32ZBA-NEXT:    or a6, a6, a7
-; RV32ZBA-NEXT:    or a4, a6, a4
+; RV32ZBA-NEXT:    or a5, a5, a6
+; RV32ZBA-NEXT:    mulhu a6, a3, a0
+; RV32ZBA-NEXT:    snez a6, a6
+; RV32ZBA-NEXT:    or a5, a5, a6
 ; RV32ZBA-NEXT:    or a4, a5, a4
 ; RV32ZBA-NEXT:    bnez a4, .LBB50_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
@@ -2790,20 +2790,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
-; RV32-NEXT:    mulhu a6, a0, a2
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    sltu a4, a4, a6
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    mulhu a5, a0, a2
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    sltu a4, a4, a5
 ; RV32-NEXT:    snez a5, a3
 ; RV32-NEXT:    snez a6, a1
 ; RV32-NEXT:    and a5, a6, a5
 ; RV32-NEXT:    mulhu a1, a1, a2
 ; RV32-NEXT:    snez a1, a1
+; RV32-NEXT:    or a1, a5, a1
 ; RV32-NEXT:    mulhu a0, a3, a0
 ; RV32-NEXT:    snez a0, a0
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    or a0, a0, a4
-; RV32-NEXT:    or a0, a5, a0
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
@@ -2817,20 +2817,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    mulhu a6, a0, a2
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    add a4, a4, a5
-; RV32ZBA-NEXT:    sltu a4, a4, a6
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    mulhu a5, a0, a2
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    sltu a4, a4, a5
 ; RV32ZBA-NEXT:    snez a5, a3
 ; RV32ZBA-NEXT:    snez a6, a1
 ; RV32ZBA-NEXT:    and a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a1, a1, a2
 ; RV32ZBA-NEXT:    snez a1, a1
+; RV32ZBA-NEXT:    or a1, a5, a1
 ; RV32ZBA-NEXT:    mulhu a0, a3, a0
 ; RV32ZBA-NEXT:    snez a0, a0
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    or a0, a0, a4
-; RV32ZBA-NEXT:    or a0, a5, a0
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2916,10 +2916,10 @@ continue:
 define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: saddo.br.i64:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    add a4, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    sltu a0, a2, a0
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add a0, a4, a0
 ; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    xor a1, a1, a3
 ; RV32-NEXT:    not a1, a1
@@ -2947,10 +2947,10 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: saddo.br.i64:
 ; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA-NEXT:    add a4, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
-; RV32ZBA-NEXT:    add a0, a3, a0
-; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a4, a0
 ; RV32ZBA-NEXT:    xor a0, a1, a0
 ; RV32ZBA-NEXT:    xor a1, a1, a3
 ; RV32ZBA-NEXT:    not a1, a1
@@ -3050,10 +3050,10 @@ continue:
 define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: uaddo.br.i64:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    add a3, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    sltu a0, a2, a0
 ; RV32-NEXT:    add a2, a3, a0
-; RV32-NEXT:    add a2, a1, a2
 ; RV32-NEXT:    beq a2, a1, .LBB55_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a0, a2, a1
@@ -3079,10 +3079,10 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: uaddo.br.i64:
 ; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA-NEXT:    add a3, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
 ; RV32ZBA-NEXT:    add a2, a3, a0
-; RV32ZBA-NEXT:    add a2, a1, a2
 ; RV32ZBA-NEXT:    beq a2, a1, .LBB55_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a2, a1
@@ -3185,12 +3185,12 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: ssubo.br.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a0, a0, a2
-; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    sub a0, a1, a0
-; RV32-NEXT:    xor a0, a1, a0
+; RV32-NEXT:    sub a2, a1, a3
+; RV32-NEXT:    sub a2, a2, a0
+; RV32-NEXT:    xor a2, a1, a2
 ; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    bgez a0, .LBB57_2
+; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    bgez a1, .LBB57_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -3214,12 +3214,12 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-LABEL: ssubo.br.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a0, a2
-; RV32ZBA-NEXT:    add a0, a3, a0
-; RV32ZBA-NEXT:    sub a0, a1, a0
-; RV32ZBA-NEXT:    xor a0, a1, a0
+; RV32ZBA-NEXT:    sub a2, a1, a3
+; RV32ZBA-NEXT:    sub a2, a2, a0
+; RV32ZBA-NEXT:    xor a2, a1, a2
 ; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    and a0, a1, a0
-; RV32ZBA-NEXT:    bgez a0, .LBB57_2
+; RV32ZBA-NEXT:    and a1, a1, a2
+; RV32ZBA-NEXT:    bgez a1, .LBB57_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
 ; RV32ZBA-NEXT:    ret
@@ -3313,8 +3313,8 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: usubo.br.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a4, a0, a2
-; RV32-NEXT:    add a3, a3, a4
 ; RV32-NEXT:    sub a3, a1, a3
+; RV32-NEXT:    sub a3, a3, a4
 ; RV32-NEXT:    beq a3, a1, .LBB59_3
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a0, a1, a3
@@ -3344,8 +3344,8 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-LABEL: usubo.br.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a4, a0, a2
-; RV32ZBA-NEXT:    add a3, a3, a4
 ; RV32ZBA-NEXT:    sub a3, a1, a3
+; RV32ZBA-NEXT:    sub a3, a3, a4
 ; RV32ZBA-NEXT:    beq a3, a1, .LBB59_3
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a1, a3
@@ -3478,21 +3478,21 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
+; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a2, a2, t1
+; RV32-NEXT:    add a2, a2, t2
 ; RV32-NEXT:    mul a3, a3, t1
-; RV32-NEXT:    add a3, t2, a3
 ; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    mul a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
-; RV32-NEXT:    add a1, a1, t4
-; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    sltu a2, t5, t4
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a0, t4
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    sltu a1, t5, t4
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add a0, a5, a0
 ; RV32-NEXT:    add a0, a0, s0
-; RV32-NEXT:    add a0, a6, a0
 ; RV32-NEXT:    srai a4, a4, 31
 ; RV32-NEXT:    xor a0, a0, a4
 ; RV32-NEXT:    xor a1, t6, a4
@@ -3551,21 +3551,21 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
+; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a2, a2, t1
+; RV32ZBA-NEXT:    add a2, a2, t2
 ; RV32ZBA-NEXT:    mul a3, a3, t1
-; RV32ZBA-NEXT:    add a3, t2, a3
 ; RV32ZBA-NEXT:    add a2, a2, a3
 ; RV32ZBA-NEXT:    mul a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
-; RV32ZBA-NEXT:    add a1, a1, t4
-; RV32ZBA-NEXT:    add a1, a1, a2
-; RV32ZBA-NEXT:    sltu a2, t5, t4
-; RV32ZBA-NEXT:    add a1, a1, a2
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    add a0, a0, t4
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:    sltu a1, t5, t4
 ; RV32ZBA-NEXT:    add a0, a0, a1
 ; RV32ZBA-NEXT:    add a0, a5, a0
 ; RV32ZBA-NEXT:    add a0, a0, s0
-; RV32ZBA-NEXT:    add a0, a6, a0
 ; RV32ZBA-NEXT:    srai a4, a4, 31
 ; RV32ZBA-NEXT:    xor a0, a0, a4
 ; RV32ZBA-NEXT:    xor a1, t6, a4
@@ -3633,16 +3633,16 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32-NEXT:    sltu t1, t1, t6
 ; RV32-NEXT:    sltu a4, a6, a4
 ; RV32-NEXT:    mulhu a6, a1, a7
+; RV32-NEXT:    add a4, a6, a4
 ; RV32-NEXT:    add a4, a4, t1
 ; RV32-NEXT:    sltu a5, t3, a5
 ; RV32-NEXT:    mulh a2, t2, a2
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sub a0, a0, a2
-; RV32-NEXT:    sub a0, a0, a5
 ; RV32-NEXT:    sub a0, t0, a0
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a0, a0, a5
 ; RV32-NEXT:    add a0, a4, a0
 ; RV32-NEXT:    add a0, a0, t5
-; RV32-NEXT:    add a0, a6, a0
 ; RV32-NEXT:    srai a3, a3, 31
 ; RV32-NEXT:    xor a0, a0, a3
 ; RV32-NEXT:    xor a1, t4, a3
@@ -3695,16 +3695,16 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32ZBA-NEXT:    sltu t1, t1, t6
 ; RV32ZBA-NEXT:    sltu a4, a6, a4
 ; RV32ZBA-NEXT:    mulhu a6, a1, a7
+; RV32ZBA-NEXT:    add a4, a6, a4
 ; RV32ZBA-NEXT:    add a4, a4, t1
 ; RV32ZBA-NEXT:    sltu a5, t3, a5
 ; RV32ZBA-NEXT:    mulh a2, t2, a2
 ; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    sub a0, a0, a2
-; RV32ZBA-NEXT:    sub a0, a0, a5
 ; RV32ZBA-NEXT:    sub a0, t0, a0
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:    add a0, a0, a5
 ; RV32ZBA-NEXT:    add a0, a4, a0
 ; RV32ZBA-NEXT:    add a0, a0, t5
-; RV32ZBA-NEXT:    add a0, a6, a0
 ; RV32ZBA-NEXT:    srai a3, a3, 31
 ; RV32ZBA-NEXT:    xor a0, a0, a3
 ; RV32ZBA-NEXT:    xor a1, t4, a3
@@ -3811,20 +3811,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
-; RV32-NEXT:    mulhu a6, a0, a2
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    sltu a4, a4, a6
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    mulhu a5, a0, a2
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    sltu a4, a4, a5
 ; RV32-NEXT:    snez a5, a3
 ; RV32-NEXT:    snez a6, a1
 ; RV32-NEXT:    and a5, a6, a5
 ; RV32-NEXT:    mulhu a1, a1, a2
 ; RV32-NEXT:    snez a1, a1
+; RV32-NEXT:    or a1, a5, a1
 ; RV32-NEXT:    mulhu a0, a3, a0
 ; RV32-NEXT:    snez a0, a0
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    or a0, a0, a4
-; RV32-NEXT:    or a0, a5, a0
 ; RV32-NEXT:    beqz a0, .LBB64_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
@@ -3848,20 +3848,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    mulhu a6, a0, a2
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    add a4, a4, a5
-; RV32ZBA-NEXT:    sltu a4, a4, a6
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    mulhu a5, a0, a2
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    sltu a4, a4, a5
 ; RV32ZBA-NEXT:    snez a5, a3
 ; RV32ZBA-NEXT:    snez a6, a1
 ; RV32ZBA-NEXT:    and a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a1, a1, a2
 ; RV32ZBA-NEXT:    snez a1, a1
+; RV32ZBA-NEXT:    or a1, a5, a1
 ; RV32ZBA-NEXT:    mulhu a0, a3, a0
 ; RV32ZBA-NEXT:    snez a0, a0
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    or a0, a0, a4
-; RV32ZBA-NEXT:    or a0, a5, a0
 ; RV32ZBA-NEXT:    beqz a0, .LBB64_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
@@ -3898,8 +3898,8 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a2, a0, a0
 ; RV32-NEXT:    sltu a0, a2, a0
-; RV32-NEXT:    add a2, a1, a0
-; RV32-NEXT:    add a2, a1, a2
+; RV32-NEXT:    add a2, a1, a1
+; RV32-NEXT:    add a2, a2, a0
 ; RV32-NEXT:    beq a2, a1, .LBB65_2
 ; RV32-NEXT:  # %bb.1: # %entry
 ; RV32-NEXT:    sltu a0, a2, a1
@@ -3927,8 +3927,8 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a2, a0, a0
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
-; RV32ZBA-NEXT:    add a2, a1, a0
-; RV32ZBA-NEXT:    add a2, a1, a2
+; RV32ZBA-NEXT:    add a2, a1, a1
+; RV32ZBA-NEXT:    add a2, a2, a0
 ; RV32ZBA-NEXT:    beq a2, a1, .LBB65_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
 ; RV32ZBA-NEXT:    sltu a0, a2, a1

diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index ff6d3ab324d5b..efcdb0cec4c72 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -1498,9 +1498,10 @@ define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    psubusw %xmm1, %xmm2
 ; SSE2-NEXT:    psubusw %xmm0, %xmm1
+; SSE2-NEXT:    paddw %xmm0, %xmm1
 ; SSE2-NEXT:    psubw %xmm0, %xmm2
 ; SSE2-NEXT:    paddw %xmm1, %xmm2
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: abd_minmax_v8i16:

diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll
index 30e45a4583624..17eda59660193 100644
--- a/llvm/test/CodeGen/X86/add-sub-bool.ll
+++ b/llvm/test/CodeGen/X86/add-sub-bool.ll
@@ -68,11 +68,11 @@ define i32 @test_i32_add_add_idx0(i32 %x, i32 %y, i32 %z) nounwind {
 ;
 ; X64-LABEL: test_i32_add_add_idx0:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    leal (%rdi,%rsi), %eax
 ; X64-NEXT:    andl $1, %edx
-; X64-NEXT:    leal (%rdx,%rdi), %eax
-; X64-NEXT:    addl %esi, %eax
+; X64-NEXT:    addl %edx, %eax
 ; X64-NEXT:    retq
   %add = add i32 %y, %x
   %mask = and i32 %z, 1

diff --git a/llvm/test/CodeGen/X86/alias-static-alloca.ll b/llvm/test/CodeGen/X86/alias-static-alloca.ll
index 5df2906d512a4..f4a9e4b0df576 100644
--- a/llvm/test/CodeGen/X86/alias-static-alloca.ll
+++ b/llvm/test/CodeGen/X86/alias-static-alloca.ll
@@ -7,15 +7,17 @@
 define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $ecx killed $ecx def $rcx
 ; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
 ; CHECK-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    leal (%rsi,%rdx), %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    addl %edi, %eax
+; CHECK-NEXT:    addl %edi, %esi
+; CHECK-NEXT:    leal (%rdx,%rcx), %eax
+; CHECK-NEXT:    addl %esi, %eax
 ; CHECK-NEXT:    retq
 entry:
   %a0 = alloca i32

diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
index d0b3b022e15d7..3cc9ef5852ebd 100644
--- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -59,13 +59,13 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
 define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
 ; CHECK-LABEL: DAGCombineB:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
-; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; CHECK-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %t1 = add <8 x i32> %v1, %v2
   %t2 = add <8 x i32> %t1, %v1

diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
index d89ebe0d896b0..470dfdfccfc41 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -316,7 +316,7 @@ define void @bcast_unfold_mul_v8i64(i64* %arg) {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm0, %zmm1
-; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB9_1
@@ -351,7 +351,7 @@ define void @bcast_unfold_mul_v4i64(i64* %arg) {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
-; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB10_1
@@ -386,7 +386,7 @@ define void @bcast_unfold_mul_v2i64(i64* %arg) {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm0
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm1
-; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vmovdqu %xmm0, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB11_1

diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
index 82aff699cf297..d5d10565f24b8 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
@@ -117,10 +117,10 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtsd2usi %xmm0, %rcx
-; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT:    vcvtsd2usi %xmm0, %rax
+; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %rax
-; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 
@@ -136,10 +136,10 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtsd2si %xmm0, %rcx
-; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT:    vcvtsd2si %xmm0, %rax
+; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %rax
-; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 
@@ -155,10 +155,10 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtss2usi %xmm0, %rcx
-; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT:    vcvtss2usi %xmm0, %rax
+; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %rax
-; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 
@@ -174,10 +174,10 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtss2si %xmm0, %rcx
-; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %rdx
+; CHECK-NEXT:    vcvtss2si %xmm0, %rax
+; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %rax
-; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 

diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 58d4e744c8e5b..591256dfeda25 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -934,10 +934,10 @@ declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsd2usi %xmm0, %ecx
-; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %edx
+; CHECK-NEXT:    vcvtsd2usi %xmm0, %eax
+; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 
@@ -953,10 +953,10 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsd2si %xmm0, %ecx
-; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %edx
+; CHECK-NEXT:    vcvtsd2si %xmm0, %eax
+; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 
@@ -972,10 +972,10 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtss2usi %xmm0, %ecx
-; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %edx
+; CHECK-NEXT:    vcvtss2usi %xmm0, %eax
+; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 
@@ -991,10 +991,10 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtss2si %xmm0, %ecx
-; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %edx
+; CHECK-NEXT:    vcvtss2si %xmm0, %eax
+; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %eax
-; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 
@@ -3220,9 +3220,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovqb %zmm0, %xmm2
 ; X64-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovqb %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3232,9 +3232,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vpmovqb %zmm0, %xmm2
 ; X86-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovqb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -3278,9 +3278,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovsqb %zmm0, %xmm2
 ; X64-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovsqb %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3290,9 +3290,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vpmovsqb %zmm0, %xmm2
 ; X86-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovsqb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -3336,9 +3336,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovusqb %zmm0, %xmm2
 ; X64-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovusqb %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3348,9 +3348,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vpmovusqb %zmm0, %xmm2
 ; X86-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovusqb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -3394,9 +3394,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovqw %zmm0, %xmm2
 ; X64-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3406,9 +3406,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vpmovqw %zmm0, %xmm2
 ; X86-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -3452,9 +3452,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovsqw %zmm0, %xmm2
 ; X64-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovsqw %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3464,9 +3464,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vpmovsqw %zmm0, %xmm2
 ; X86-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovsqw %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -3510,9 +3510,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovusqw %zmm0, %xmm2
 ; X64-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovusqw %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3522,9 +3522,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vpmovusqw %zmm0, %xmm2
 ; X86-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovusqw %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -3786,9 +3786,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovdb %zmm0, %xmm2
 ; X64-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3797,9 +3797,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8>
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovdb %zmm0, %xmm2
 ; X86-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
@@ -3842,9 +3842,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovsdb %zmm0, %xmm2
 ; X64-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovsdb %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3853,9 +3853,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8>
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovsdb %zmm0, %xmm2
 ; X86-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovsdb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
@@ -3898,9 +3898,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovusdb %zmm0, %xmm2
 ; X64-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X64-NEXT:    vpmovusdb %zmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
 ;
@@ -3909,9 +3909,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovusdb %zmm0, %xmm2
 ; X86-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
 ; X86-NEXT:    vpmovusdb %zmm0, %xmm0 {%k1} {z}
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
@@ -3954,9 +3954,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovdw %zmm0, %ymm2
 ; X64-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
+; X64-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X64-NEXT:    vpmovdw %zmm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
@@ -3964,9 +3964,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovdw %zmm0, %ymm2
 ; X86-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
+; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vpmovdw %zmm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
 ; X86-NEXT:    retl
     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
@@ -4008,9 +4008,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i1
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovsdw %zmm0, %ymm2
 ; X64-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
+; X64-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X64-NEXT:    vpmovsdw %zmm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
@@ -4018,9 +4018,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i1
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovsdw %zmm0, %ymm2
 ; X86-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
+; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vpmovsdw %zmm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
 ; X86-NEXT:    retl
     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
@@ -4062,9 +4062,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i
 ; X64-NEXT:    kmovw %edi, %k1
 ; X64-NEXT:    vpmovusdw %zmm0, %ymm2
 ; X64-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
+; X64-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X64-NEXT:    vpmovusdw %zmm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
-; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
@@ -4072,9 +4072,9 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X86-NEXT:    vpmovusdw %zmm0, %ymm2
 ; X86-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
+; X86-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; X86-NEXT:    vpmovusdw %zmm0, %ymm0 {%k1} {z}
 ; X86-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
-; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
 ; X86-NEXT:    retl
     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
@@ -4548,9 +4548,9 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; X64-NEXT:    kmovw %k0, %esi
 ; X64-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X64-NEXT:    kmovw %k0, %eax
+; X64-NEXT:    orl %ecx, %edx
 ; X64-NEXT:    orl %esi, %eax
 ; X64-NEXT:    orl %edx, %eax
-; X64-NEXT:    orl %ecx, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -4569,9 +4569,9 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; X86-NEXT:    kmovw %k0, %esi
 ; X86-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %eax
+; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -4625,9 +4625,9 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
 ; X64-NEXT:    kmovw %k0, %esi
 ; X64-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X64-NEXT:    kmovw %k0, %eax
+; X64-NEXT:    andl %ecx, %edx
 ; X64-NEXT:    andl %esi, %eax
 ; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    andl %ecx, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -4646,9 +4646,9 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
 ; X86-NEXT:    kmovw %k0, %esi
 ; X86-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %eax
+; X86-NEXT:    andl %ecx, %edx
 ; X86-NEXT:    andl %esi, %eax
 ; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    andl %ecx, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4

diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index f60eec9b9e3ea..7ac36cd49439b 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1239,11 +1239,11 @@ define <64 x i8> @test16(i64 %x) {
 ; X86-NEXT:    kshiftlq $6, %k1, %k1
 ; X86-NEXT:    kshiftlq $59, %k0, %k0
 ; X86-NEXT:    kshiftrq $59, %k0, %k0
+; X86-NEXT:    korq %k1, %k0, %k0
 ; X86-NEXT:    movb $1, %al
-; X86-NEXT:    kmovd %eax, %k2
-; X86-NEXT:    kshiftlq $63, %k2, %k2
-; X86-NEXT:    kshiftrq $58, %k2, %k2
-; X86-NEXT:    korq %k1, %k2, %k1
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    kshiftlq $63, %k1, %k1
+; X86-NEXT:    kshiftrq $58, %k1, %k1
 ; X86-NEXT:    korq %k0, %k1, %k0
 ; X86-NEXT:    vpmovm2b %k0, %zmm0
 ; X86-NEXT:    retl
@@ -1361,10 +1361,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; X86-NEXT:    kshiftlq $6, %k1, %k1
 ; X86-NEXT:    kshiftlq $59, %k0, %k0
 ; X86-NEXT:    kshiftrq $59, %k0, %k0
-; X86-NEXT:    kmovd %eax, %k2
-; X86-NEXT:    kshiftlq $63, %k2, %k2
-; X86-NEXT:    kshiftrq $58, %k2, %k2
-; X86-NEXT:    korq %k1, %k2, %k1
+; X86-NEXT:    korq %k1, %k0, %k0
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    kshiftlq $63, %k1, %k1
+; X86-NEXT:    kshiftrq $58, %k1, %k1
 ; X86-NEXT:    korq %k0, %k1, %k0
 ; X86-NEXT:    vpmovm2b %k0, %zmm0
 ; X86-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index 412c30906cb5c..474f6a9e1948e 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -35,29 +35,29 @@ define dso_local x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1,
 ;
 ; WIN64-LABEL: test_argv64i1:
 ; WIN64:       # %bb.0:
-; WIN64-NEXT:    addq %rdx, %rcx
-; WIN64-NEXT:    addq %rdi, %rcx
-; WIN64-NEXT:    addq %rsi, %rcx
-; WIN64-NEXT:    addq %r8, %rcx
+; WIN64-NEXT:    addq %rcx, %rax
+; WIN64-NEXT:    addq %rdx, %rax
+; WIN64-NEXT:    addq %rdi, %rax
+; WIN64-NEXT:    leaq (%rsi,%r8), %rcx
 ; WIN64-NEXT:    addq %r9, %rcx
-; WIN64-NEXT:    addq %r10, %rcx
-; WIN64-NEXT:    addq %r11, %rcx
+; WIN64-NEXT:    addq %rcx, %rax
+; WIN64-NEXT:    leaq (%r10,%r11), %rcx
 ; WIN64-NEXT:    addq %r12, %rcx
 ; WIN64-NEXT:    addq %r14, %rcx
-; WIN64-NEXT:    addq %r15, %rcx
 ; WIN64-NEXT:    addq %rcx, %rax
+; WIN64-NEXT:    addq %r15, %rax
 ; WIN64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    retq
 ;
 ; LINUXOSX64-LABEL: test_argv64i1:
 ; LINUXOSX64:       # %bb.0:
-; LINUXOSX64-NEXT:    addq %rdx, %rcx
-; LINUXOSX64-NEXT:    addq %rdi, %rcx
-; LINUXOSX64-NEXT:    addq %rsi, %rcx
-; LINUXOSX64-NEXT:    addq %r8, %rcx
+; LINUXOSX64-NEXT:    addq %rcx, %rax
+; LINUXOSX64-NEXT:    addq %rdx, %rax
+; LINUXOSX64-NEXT:    addq %rdi, %rax
+; LINUXOSX64-NEXT:    leaq (%rsi,%r8), %rcx
 ; LINUXOSX64-NEXT:    addq %r9, %rcx
-; LINUXOSX64-NEXT:    addq %r12, %rcx
-; LINUXOSX64-NEXT:    addq %r13, %rcx
+; LINUXOSX64-NEXT:    addq %rcx, %rax
+; LINUXOSX64-NEXT:    leaq (%r12,%r13), %rcx
 ; LINUXOSX64-NEXT:    addq %r14, %rcx
 ; LINUXOSX64-NEXT:    addq %r15, %rcx
 ; LINUXOSX64-NEXT:    addq %rcx, %rax

diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index bb762469571d1..0caa8826e75c8 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -939,7 +939,7 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %ebp
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    subl $16, %esp
+; X32-NEXT:    subl $12, %esp
 ; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %esi
 ; X32-NEXT:    movl %edx, %ebx
@@ -950,37 +950,36 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; X32-NEXT:    subl %esi, %ebx
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    subl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl %ebp, %ecx
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl %esi, %edx
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    imull %ebx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    imull %ebx, %eax
+; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT:    subl %ebp, %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl %eax, %ecx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    imull %edx, %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    imull %ebp, %edi
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    imull %ebp, %eax
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    addl %eax, %edi
+; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    imull %eax, %edx
+; X32-NEXT:    addl %edx, %edi
 ; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    addl $12, %esp
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    popl %ebp
 ; X32-NEXT:    retl
@@ -1014,18 +1013,18 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; WIN64-NEXT:    # kill: def $r11d killed $r11d killed $r11
 ; WIN64-NEXT:    subl %r12d, %r11d
 ; WIN64-NEXT:    imull %edx, %r11d
+; WIN64-NEXT:    addl %r9d, %r11d
 ; WIN64-NEXT:    leal (%r14,%r15), %edx
-; WIN64-NEXT:    # kill: def $r14d killed $r14d killed $r14
-; WIN64-NEXT:    subl %r15d, %r14d
-; WIN64-NEXT:    imull %esi, %r14d
-; WIN64-NEXT:    addl %r11d, %r14d
+; WIN64-NEXT:    movl %r14d, %r9d
+; WIN64-NEXT:    subl %r15d, %r9d
+; WIN64-NEXT:    imull %esi, %r9d
+; WIN64-NEXT:    addl %r11d, %r9d
 ; WIN64-NEXT:    addl %ecx, %eax
 ; WIN64-NEXT:    imull %r8d, %eax
 ; WIN64-NEXT:    imull %ebx, %r10d
+; WIN64-NEXT:    addl %r10d, %eax
 ; WIN64-NEXT:    imull %edi, %edx
-; WIN64-NEXT:    addl %r10d, %edx
 ; WIN64-NEXT:    addl %edx, %eax
-; WIN64-NEXT:    addl %r14d, %eax
 ; WIN64-NEXT:    addl %r9d, %eax
 ; WIN64-NEXT:    popq %rbx
 ; WIN64-NEXT:    retq
@@ -1055,19 +1054,19 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; LINUXOSX64-NEXT:    leal (%r13,%r14), %r11d
 ; LINUXOSX64-NEXT:    movl %r13d, %r12d
 ; LINUXOSX64-NEXT:    subl %r14d, %r12d
-; LINUXOSX64-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
 ; LINUXOSX64-NEXT:    imull %edx, %r12d
-; LINUXOSX64-NEXT:    movl %r15d, %edx
-; LINUXOSX64-NEXT:    subl %r14d, %edx
-; LINUXOSX64-NEXT:    imull %esi, %edx
-; LINUXOSX64-NEXT:    addl %r12d, %edx
+; LINUXOSX64-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; LINUXOSX64-NEXT:    addl %r9d, %r12d
+; LINUXOSX64-NEXT:    movl %r15d, %r9d
+; LINUXOSX64-NEXT:    subl %edx, %r9d
+; LINUXOSX64-NEXT:    imull %esi, %r9d
+; LINUXOSX64-NEXT:    addl %r12d, %r9d
 ; LINUXOSX64-NEXT:    addl %ecx, %eax
 ; LINUXOSX64-NEXT:    imull %r8d, %eax
 ; LINUXOSX64-NEXT:    imull %r10d, %r11d
-; LINUXOSX64-NEXT:    addl %r15d, %r14d
-; LINUXOSX64-NEXT:    imull %edi, %r14d
-; LINUXOSX64-NEXT:    addl %r11d, %r14d
-; LINUXOSX64-NEXT:    addl %r14d, %eax
+; LINUXOSX64-NEXT:    addl %r11d, %eax
+; LINUXOSX64-NEXT:    addl %r15d, %edx
+; LINUXOSX64-NEXT:    imull %edi, %edx
 ; LINUXOSX64-NEXT:    addl %edx, %eax
 ; LINUXOSX64-NEXT:    addl %r9d, %eax
 ; LINUXOSX64-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index b3148629dd554..fa463d3394d3e 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1905,19 +1905,19 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
-; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rdx, %rcx # encoding: [0x48,0x01,0xd1]
-; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
 ; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
 ; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -1987,23 +1987,23 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
-; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
+; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
 ; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
 ; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
 ; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2078,19 +2078,19 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
-; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rdx, %rcx # encoding: [0x48,0x01,0xd1]
-; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
 ; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
 ; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -2160,23 +2160,23 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
-; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
-; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
+; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
 ; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
 ; X64-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
 ; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2206,19 +2206,19 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
-; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2228,19 +2228,19 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X64-NEXT:    vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
-; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
-; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
 ; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
@@ -2268,23 +2268,23 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
-; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
 ; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2293,23 +2293,23 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X64-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
-; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
 ; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
@@ -2339,19 +2339,19 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
-; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X86-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2361,19 +2361,19 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X64-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
-; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
-; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
 ; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
@@ -2401,23 +2401,23 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
-; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X86-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
-; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
 ; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2426,23 +2426,23 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X64-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
-; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
 ; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
 ; X64-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
 ; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)

diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
index 2654a300e2052..8ea5023908d26 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
@@ -53,8 +53,8 @@ define <32 x half> @test_sqrt_ph_512_fast_estimate_attribute_2(<32 x half> %a0,
 ; CHECK-NEXT:    vmulph %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT:    vfmadd213ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm0
 ; CHECK-NEXT:    vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm2
+; CHECK-NEXT:    vmulph %zmm2, %zmm1, %zmm1
 ; CHECK-NEXT:    vmulph %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vmulph %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
   %2 = fdiv fast <32 x half> %a1, %1
@@ -697,9 +697,9 @@ define i8 @test_int_x86_avx512_mask_cmp_sh_all(<8 x half> %x0, <8 x half> %x1, i
 ; CHECK-NEXT:    kmovd %k0, %esi
 ; CHECK-NEXT:    vcmpnltsh {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andb %cl, %dl
 ; CHECK-NEXT:    andb %sil, %al
 ; CHECK-NEXT:    andb %dl, %al
-; CHECK-NEXT:    andb %cl, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %res1 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 2, i8 -1, i32 4)

diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
index 9cdb47b4a21fa..582d889e148c5 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -2094,10 +2094,10 @@ for.end:                                          ; preds = %for.body.preheader,
 define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind {
 ; X64-LABEL: pr52561:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [112,112,112,112,112,112,112,112]
-; X64-NEXT:    vpaddd %ymm4, %ymm2, %ymm2
+; X64-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
 ; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X64-NEXT:    vpaddd %ymm4, %ymm3, %ymm2
 ; X64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -2110,11 +2110,11 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-32, %esp
 ; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; X86-NEXT:    vpaddd 8(%ebp), %ymm1, %ymm1
-; X86-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [112,112,112,112,112,112,112,112]
-; X86-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
+; X86-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
 ; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X86-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
+; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
 ; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X86-NEXT:    vmovsh %xmm0, %xmm2, %xmm0

diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll
index 5d7c468351ac2..17900c9e5466f 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll
@@ -7,10 +7,10 @@ define void @test_mscatter_v16f16(ptr %base, <16 x i32> %index, <16 x half> %val
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm3
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vpmovsxdq %ymm2, %zmm2
-; CHECK-NEXT:    vpaddq %zmm2, %zmm3, %zmm4
-; CHECK-NEXT:    vpaddq %zmm2, %zmm4, %zmm2
+; CHECK-NEXT:    vpaddq %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm3
+; CHECK-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vmovq %xmm0, %rax
 ; CHECK-NEXT:    vmovsh %xmm1, (%rax)

diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index e274557534774..fc7c8facb9d5e 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -1665,9 +1665,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2]
 ; X86-NEXT:    vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
@@ -1675,9 +1675,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2]
 ; X64-NEXT:    vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
@@ -1719,9 +1719,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2]
 ; X86-NEXT:    vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
@@ -1729,9 +1729,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2]
 ; X64-NEXT:    vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovsqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
@@ -1773,9 +1773,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2]
 ; X86-NEXT:    vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
@@ -1783,9 +1783,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2]
 ; X64-NEXT:    vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovusqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
@@ -1827,9 +1827,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2]
 ; X86-NEXT:    vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1838,9 +1838,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2]
 ; X64-NEXT:    vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -1885,9 +1885,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2]
 ; X86-NEXT:    vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1896,9 +1896,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2]
 ; X64-NEXT:    vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovsqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -1943,9 +1943,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2]
 ; X86-NEXT:    vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -1954,9 +1954,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2]
 ; X64-NEXT:    vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovusqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
@@ -2001,9 +2001,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2]
 ; X86-NEXT:    vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_128:
@@ -2011,9 +2011,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2]
 ; X64-NEXT:    vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
@@ -2055,9 +2055,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2]
 ; X86-NEXT:    vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_128:
@@ -2065,9 +2065,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2]
 ; X64-NEXT:    vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovsqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
@@ -2109,9 +2109,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2]
 ; X86-NEXT:    vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_128:
@@ -2119,9 +2119,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2]
 ; X64-NEXT:    vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovusqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
@@ -2163,9 +2163,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2]
 ; X86-NEXT:    vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2174,9 +2174,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2]
 ; X64-NEXT:    vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -2221,9 +2221,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2]
 ; X86-NEXT:    vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2232,9 +2232,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2]
 ; X64-NEXT:    vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovsqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -2279,9 +2279,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2]
 ; X86-NEXT:    vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2290,9 +2290,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2]
 ; X64-NEXT:    vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovusqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
@@ -2337,9 +2337,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2]
 ; X86-NEXT:    vpmovqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
+; X86-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X86-NEXT:    vpmovqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X86-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_128:
@@ -2347,9 +2347,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2]
 ; X64-NEXT:    vpmovqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
+; X64-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X64-NEXT:    vpmovqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc0]
 ; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X64-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
     %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
@@ -2391,9 +2391,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2]
 ; X86-NEXT:    vpmovsqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
+; X86-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X86-NEXT:    vpmovsqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X86-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_128:
@@ -2401,9 +2401,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2]
 ; X64-NEXT:    vpmovsqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
+; X64-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X64-NEXT:    vpmovsqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc0]
 ; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X64-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
     %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
@@ -2445,9 +2445,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2]
 ; X86-NEXT:    vpmovusqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
+; X86-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X86-NEXT:    vpmovusqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc0]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X86-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_128:
@@ -2455,9 +2455,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2]
 ; X64-NEXT:    vpmovusqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
+; X64-NEXT:    vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
 ; X64-NEXT:    vpmovusqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc0]
 ; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; X64-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
     %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
@@ -2734,9 +2734,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2]
 ; X86-NEXT:    vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_db_128:
@@ -2744,9 +2744,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2]
 ; X64-NEXT:    vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
@@ -2788,9 +2788,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2]
 ; X86-NEXT:    vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
@@ -2798,9 +2798,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2]
 ; X64-NEXT:    vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovsdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
@@ -2842,9 +2842,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2]
 ; X86-NEXT:    vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
@@ -2852,9 +2852,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2]
 ; X64-NEXT:    vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovusdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
@@ -2896,9 +2896,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2]
 ; X86-NEXT:    vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2907,9 +2907,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2]
 ; X64-NEXT:    vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
@@ -2954,9 +2954,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2]
 ; X86-NEXT:    vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovsdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2965,9 +2965,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2]
 ; X64-NEXT:    vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovsdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
@@ -3012,9 +3012,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2]
 ; X86-NEXT:    vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
+; X86-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X86-NEXT:    vpmovusdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc0]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X86-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3023,9 +3023,9 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2]
 ; X64-NEXT:    vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
+; X64-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9]
 ; X64-NEXT:    vpmovusdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc0]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0]
-; X64-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
@@ -3070,9 +3070,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2]
 ; X86-NEXT:    vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_128:
@@ -3080,9 +3080,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2]
 ; X64-NEXT:    vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
@@ -3124,9 +3124,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2]
 ; X86-NEXT:    vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_128:
@@ -3134,9 +3134,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2]
 ; X64-NEXT:    vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovsdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
@@ -3178,9 +3178,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2]
 ; X86-NEXT:    vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_128:
@@ -3188,9 +3188,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2]
 ; X64-NEXT:    vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovusdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
@@ -3232,9 +3232,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2]
 ; X86-NEXT:    vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3243,9 +3243,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2]
 ; X64-NEXT:    vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
@@ -3290,9 +3290,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2]
 ; X86-NEXT:    vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovsdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3301,9 +3301,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2]
 ; X64-NEXT:    vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovsdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
@@ -3348,9 +3348,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16>
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2]
 ; X86-NEXT:    vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X86-NEXT:    vpmovusdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc0]
 ; X86-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -3359,9 +3359,9 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16>
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2]
 ; X64-NEXT:    vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
 ; X64-NEXT:    vpmovusdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc0]
 ; X64-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0]
-; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
@@ -4357,9 +4357,9 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
 ; X86-NEXT:    vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
 ; X86-NEXT:    vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
-; X86-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X86-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X86-NEXT:    vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_x86_vcvtps2ph_128:
@@ -4367,9 +4367,9 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
 ; X64-NEXT:    vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
 ; X64-NEXT:    vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
-; X64-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X64-NEXT:    vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
   %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask)
@@ -4388,9 +4388,9 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
 ; X86-NEXT:    vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
 ; X86-NEXT:    vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
-; X86-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X86-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X86-NEXT:    vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -4399,9 +4399,9 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
 ; X64-NEXT:    vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3]
 ; X64-NEXT:    vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
-; X64-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3]
-; X64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2]
+; X64-NEXT:    vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)

diff --git a/llvm/test/CodeGen/X86/bmi-out-of-order.ll b/llvm/test/CodeGen/X86/bmi-out-of-order.ll
index 1ae1233f1ee72..93dd432adb953 100644
--- a/llvm/test/CodeGen/X86/bmi-out-of-order.ll
+++ b/llvm/test/CodeGen/X86/bmi-out-of-order.ll
@@ -68,10 +68,10 @@ define i32 @blsmask_through2(i32 %a, i32 %b, i32 %c) nounwind {
 ; X64-LABEL: blsmask_through2:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    xorl %edx, %edi
-; X64-NEXT:    xorl %esi, %edi
 ; X64-NEXT:    leal -1(%rsi), %eax
+; X64-NEXT:    xorl %edx, %edi
 ; X64-NEXT:    xorl %edi, %eax
+; X64-NEXT:    xorl %esi, %eax
 ; X64-NEXT:    retq
 entry:
   %sub = add nsw i32 %b, -1
@@ -104,11 +104,11 @@ define i64 @blsmask_through3(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
 ;
 ; X64-LABEL: blsmask_through3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorq %rdx, %rdi
-; X64-NEXT:    xorq %rcx, %rdi
-; X64-NEXT:    xorq %rsi, %rdi
 ; X64-NEXT:    leaq -1(%rsi), %rax
+; X64-NEXT:    xorq %rdx, %rdi
 ; X64-NEXT:    xorq %rdi, %rax
+; X64-NEXT:    xorq %rsi, %rcx
+; X64-NEXT:    xorq %rcx, %rax
 ; X64-NEXT:    retq
 entry:
   %sub = add nsw i64 %b, -1
@@ -159,19 +159,19 @@ define i64 @blsmask_through1_used2(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    addl $-1, %edi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    xorl %ebx, %ebp
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    adcl $-1, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %ebp, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %edi, %eax
-; X86-NEXT:    xorl %ebp, %esi
+; X86-NEXT:    xorl %ebx, %esi
 ; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    imull %eax, %ebp
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    imull %edi, %ebp
-; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %edi, %ebx
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    popl %esi
@@ -264,10 +264,10 @@ define i32 @blsi_through2(i32 %a, i32 %b, i32 %c) nounwind {
 ; X64-LABEL: blsi_through2:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    andl %edx, %edi
-; X64-NEXT:    andl %esi, %edi
 ; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %edi
 ; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    andl %esi, %eax
 ; X64-NEXT:    retq
 entry:
   %sub = sub i32 0, %b
@@ -299,10 +299,10 @@ define i64 @blsi_through3(i64 %a, i64 %b, i64 %c) nounwind {
 ; X64-LABEL: blsi_through3:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    andq %rdx, %rdi
-; X64-NEXT:    andq %rsi, %rdi
 ; X64-NEXT:    negq %rax
+; X64-NEXT:    andq %rdx, %rdi
 ; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    andq %rsi, %rax
 ; X64-NEXT:    retq
 entry:
   %sub = sub i64 0, %b
@@ -362,9 +362,9 @@ define i64 @blsi_through1_used2(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    andl %eax, %ecx
 ; X86-NEXT:    imull %edx, %ebx
 ; X86-NEXT:    imull %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    popl %esi
@@ -456,10 +456,10 @@ define i32 @blsr_through2(i32 %a, i32 %b, i32 %c) nounwind {
 ; X64-LABEL: blsr_through2:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    andl %edx, %edi
-; X64-NEXT:    andl %esi, %edi
 ; X64-NEXT:    leal -1(%rsi), %eax
+; X64-NEXT:    andl %edx, %edi
 ; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    andl %esi, %eax
 ; X64-NEXT:    retq
 entry:
   %sub = add nsw i32 %b, -1
@@ -492,12 +492,12 @@ define i64 @blsr_through3(i64 %a, i64 %b, i64 %c, i64 %d) nounwind {
 ;
 ; X64-LABEL: blsr_through3:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    andq %rdx, %rdi
-; X64-NEXT:    andq %rcx, %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    negq %rax
-; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    andq %rsi, %rcx
+; X64-NEXT:    negq %rsi
+; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    andq %rcx, %rax
 ; X64-NEXT:    retq
 entry:
   %sub = sub nsw i64 0, %b
@@ -548,19 +548,19 @@ define i64 @blsr_through1_used2(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    addl $-1, %edi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    andl %ebx, %ebp
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    adcl $-1, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    andl %ebp, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl %edi, %eax
-; X86-NEXT:    andl %ebp, %esi
+; X86-NEXT:    andl %ebx, %esi
 ; X86-NEXT:    andl %eax, %ecx
-; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    imull %eax, %ebp
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    imull %edi, %ebp
-; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %edi, %ebx
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    popl %esi

diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll
index 9ad370a8b48e4..f96968d5d50f3 100644
--- a/llvm/test/CodeGen/X86/combine-add.ll
+++ b/llvm/test/CodeGen/X86/combine-add.ll
@@ -249,9 +249,9 @@ define void @PR52039(ptr %pa, ptr %pb) {
 ; AVX1-NEXT:    vpsubd 16(%rdi), %xmm0, %xmm1
 ; AVX1-NEXT:    vpsubd (%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm3
 ; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm3, 16(%rdi)

diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index dccabfd7f7f32..70258d752b892 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -124,10 +124,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __divdi3
@@ -136,10 +136,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, 4(%edx)
 ; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    imull %eax, %ebp
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -178,141 +178,146 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $152, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    sarl $31, %ebp
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    xorl %ebp, %edi
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    subl %ebp, %edi
+; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    sbbl %ebp, %esi
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    sete %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    bsrl %edi, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    bsrl %ebp, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnel %eax, %edx
-; X86-NEXT:    bsrl %ebx, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    bsrl %ebp, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    addl $32, %edi
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnel %eax, %ecx
-; X86-NEXT:    addl $64, %ecx
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    addl $64, %edi
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %ebp
-; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %ebp
+; X86-NEXT:    cmovnel %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    bsrl %ebp, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    bsrl %ebp, %edi
-; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    addl $32, %esi
-; X86-NEXT:    testl %ebp, %ebp
-; X86-NEXT:    cmovnel %edi, %esi
-; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $127, %edx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    setb %dl
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT:    cmovnel %edi, %ebx
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %esi
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $127, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    orb (%esp), %cl # 1-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    jne .LBB4_8
-; X86-NEXT:  # %bb.1: # %_udiv-special-cases
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.8: # %_udiv-special-cases
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    xorl $127, %edx
-; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    je .LBB4_8
-; X86-NEXT:  # %bb.2: # %udiv-bb1
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    je .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-bb1
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -322,7 +327,6 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    xorb $127, %al
 ; X86-NEXT:    movb %al, %ch
@@ -330,43 +334,46 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %edi
-; X86-NEXT:    movl 144(%esp,%edi), %edx
-; X86-NEXT:    movl 148(%esp,%edi), %ebx
+; X86-NEXT:    movsbl %al, %esi
+; X86-NEXT:    movl 144(%esp,%esi), %edx
+; X86-NEXT:    movl 148(%esp,%esi), %ebx
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %ebx
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 140(%esp,%edi), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    shrl %esi
-; X86-NEXT:    shrl %cl, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 136(%esp,%edi), %edx
+; X86-NEXT:    movl 140(%esp,%esi), %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 136(%esp,%esi), %edx
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %eax
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl $1, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    jae .LBB4_3
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.6:
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_3: # %udiv-preheader
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    jmp .LBB4_9
+; X86-NEXT:  .LBB4_2: # %udiv-preheader
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -374,10 +381,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movb %dl, %ch
 ; X86-NEXT:    andb $7, %ch
 ; X86-NEXT:    movb %dl, %cl
@@ -385,28 +392,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movzbl %cl, %edx
 ; X86-NEXT:    movl 100(%esp,%edx), %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%edx), %ebp
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%edx), %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    movl 88(%esp,%eax), %ebx
-; X86-NEXT:    movl 92(%esp,%eax), %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%esp,%ebx), %ebp
+; X86-NEXT:    movl 92(%esp,%ebx), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %ebp, %ebp
-; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    orl %eax, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, %ebx
+; X86-NEXT:    shrdl %cl, %ebx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -416,32 +421,31 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4, 0x90
-; X86-NEXT:  .LBB4_4: # %udiv-do-while
+; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %ebp
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    shldl $1, %ebx, %edx
-; X86-NEXT:    shldl $1, %edi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shldl $1, %ebp, %edx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %edi
 ; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %ecx
 ; X86-NEXT:    orl %eax, %ecx
@@ -449,146 +453,141 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl %esi, %esi
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl %edi, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    subl %ecx, %ebp
 ; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    jne .LBB4_4
-; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    jne .LBB4_3
+; X86-NEXT:  # %bb.4:
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %ebx
-; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    shldl $1, %eax, %esi
-; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %eax
 ; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:  .LBB4_8: # %udiv-end
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:  .LBB4_9: # %udiv-end
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    xorl %ecx, %edi
 ; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    subl %ecx, %esi
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %esi
+; X86-NEXT:    sbbl %ecx, %edi
 ; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
+; X86-NEXT:    movl %edi, 8(%ecx)
 ; X86-NEXT:    movl %ebx, 12(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sbbl %edi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    addl $152, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -614,8 +613,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    movq %rax, (%rbx)
 ; X64-NEXT:    imulq %rax, %r14
 ; X64-NEXT:    mulq %r15
+; X64-NEXT:    addq %r14, %rdx
 ; X64-NEXT:    imulq %r15, %rcx
-; X64-NEXT:    addq %r14, %rcx
 ; X64-NEXT:    addq %rdx, %rcx
 ; X64-NEXT:    subq %rax, %r13
 ; X64-NEXT:    sbbq %rcx, %r12

diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 9212295254e99..76ba2863b5960 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -124,10 +124,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __udivdi3
@@ -136,10 +136,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, 4(%edx)
 ; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    imull %eax, %ebp
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -177,109 +177,106 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $132, %esp
+; X86-NEXT:    subl $136, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    orl %ebp, %eax
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sete (%esp) # 1-byte Folded Spill
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %cl
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    bsrl %ebp, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
+; X86-NEXT:    bsrl %ebp, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    testl %ebp, %ebp
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmovnel %edx, %ecx
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    addl $32, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    addl $64, %edi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    orl %ebp, %edx
+; X86-NEXT:    cmovnel %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    bsrl %ebp, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnel %esi, %ecx
-; X86-NEXT:    addl $64, %ecx
-; X86-NEXT:    orl %ebp, %ebx
+; X86-NEXT:    testl %ebp, %ebp
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    bsrl %edx, %edx
+; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    bsrl %ebp, %edi
-; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    bsrl %eax, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    addl $32, %esi
-; X86-NEXT:    testl %ebp, %ebp
-; X86-NEXT:    cmovnel %edi, %esi
-; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl %esi, %ecx
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    movl %ebp, %edx
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    movl $127, %edx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ecx, %edx
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl $127, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    setb %dl
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT:    orb (%esp), %dl # 1-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmovnel %edi, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    cmovnel %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovnel %edi, %ebx
-; X86-NEXT:    cmovel {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    orb (%esp), %cl # 1-byte Folded Reload
+; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmovnel %ebx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    cmovnel %ebx, %edi
+; X86-NEXT:    cmovel {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.8: # %_udiv-special-cases
-; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    xorl $127, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    je .LBB4_9
 ; X86-NEXT:  # %bb.5: # %udiv-bb1
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -292,47 +289,49 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    xorb $127, %al
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %al, %ch
 ; X86-NEXT:    andb $7, %ch
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 124(%esp,%eax), %edx
-; X86-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NEXT:    movl 128(%esp,%eax), %edx
+; X86-NEXT:    movl 132(%esp,%eax), %esi
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 120(%esp,%eax), %ebp
-; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl 124(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    shrl %esi
 ; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 116(%esp,%eax), %edi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 120(%esp,%eax), %ebx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shldl %cl, %edi, %ebp
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %ebx, %edi
+; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    addl $1, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    jmp .LBB4_9
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -356,16 +355,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 80(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 84(%esp,%eax), %esi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%eax), %edi
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %edx, %ebx
-; X86-NEXT:    movl 68(%esp,%eax), %ebp
-; X86-NEXT:    movl 72(%esp,%eax), %edx
+; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    movl 72(%esp,%eax), %ebp
+; X86-NEXT:    movl 76(%esp,%eax), %edx
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    notb %cl
@@ -374,7 +372,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shrdl %cl, %edx, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -391,30 +390,32 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    shldl $1, %ebx, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edx, %ebx
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    orl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %esi
-; X86-NEXT:    orl %ebp, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %ebp, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -423,17 +424,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
@@ -442,111 +443,113 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl %ebp, %ebx
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebp, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    shldl $1, %ebp, %esi
-; X86-NEXT:    orl %ecx, %esi
-; X86-NEXT:    shldl $1, %edi, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    shldl $1, %ebx, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    addl %ebx, %ebx
+; X86-NEXT:    orl %ecx, %ebx
 ; X86-NEXT:  .LBB4_9: # %udiv-end
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %ebp, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %eax, 12(%ecx)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    imull %eax, %esi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %edi, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %ebx, %ebp
 ; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl (%esp), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %ebp, %esi
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    subl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    addl $132, %esp
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    addl $136, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -571,8 +574,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    movq %rax, (%rbx)
 ; X64-NEXT:    imulq %rax, %r14
 ; X64-NEXT:    mulq %r15
+; X64-NEXT:    addq %r14, %rdx
 ; X64-NEXT:    imulq %r15, %rcx
-; X64-NEXT:    addq %r14, %rcx
 ; X64-NEXT:    addq %rdx, %rcx
 ; X64-NEXT:    subq %rax, %r13
 ; X64-NEXT:    sbbq %rcx, %r12

diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 8c2d26c885138..ba150e7ea405e 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -785,9 +785,9 @@ define i64 @udiv_i64_3(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT:    imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -828,9 +828,9 @@ define i64 @udiv_i64_5(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-858993460, %ecx, %ecx # imm = 0xCCCCCCCC
-; X32-NEXT:    imull $-858993459, %edi, %esi # imm = 0xCCCCCCCD
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -872,9 +872,9 @@ define i64 @udiv_i64_15(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    imull $-286331154, %ecx, %ecx # imm = 0xEEEEEEEE
-; X32-NEXT:    imull $-286331153, %edi, %esi # imm = 0xEEEEEEEF
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-286331153, %edi, %ecx # imm = 0xEEEEEEEF
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    retl
@@ -916,9 +916,9 @@ define i64 @udiv_i64_17(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-252645136, %ecx, %ecx # imm = 0xF0F0F0F0
-; X32-NEXT:    imull $-252645135, %edi, %esi # imm = 0xF0F0F0F1
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-252645135, %edi, %ecx # imm = 0xF0F0F0F1
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -961,9 +961,9 @@ define i64 @udiv_i64_255(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    imull $-16843010, %ecx, %ecx # imm = 0xFEFEFEFE
-; X32-NEXT:    imull $-16843009, %esi, %esi # imm = 0xFEFEFEFF
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-16843009, %esi, %ecx # imm = 0xFEFEFEFF
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
 ;
@@ -1004,9 +1004,9 @@ define i64 @udiv_i64_257(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-16711936, %ecx, %ecx # imm = 0xFF00FF00
-; X32-NEXT:    imull $-16711935, %edi, %esi # imm = 0xFF00FF01
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-16711935, %edi, %ecx # imm = 0xFF00FF01
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -1140,9 +1140,9 @@ define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT:    imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx

diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index c80e8d105b342..2e13776715e5b 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -482,8 +482,8 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -505,8 +505,8 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -532,8 +532,8 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -555,8 +555,8 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -584,8 +584,8 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
 ; X86-64-NEXT:    movabsq $-1229782938247303441, %r8 # imm = 0xEEEEEEEEEEEEEEEF
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -609,8 +609,8 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
 ; WIN64-NEXT:    movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -638,8 +638,8 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -663,8 +663,8 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -694,8 +694,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
 ; X86-64-NEXT:    movabsq $-72340172838076673, %r8 # imm = 0xFEFEFEFEFEFEFEFF
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -721,8 +721,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
 ; WIN64-NEXT:    movabsq $-72340172838076673, %r10 # imm = 0xFEFEFEFEFEFEFEFF
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -750,8 +750,8 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -775,8 +775,8 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -806,8 +806,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
 ; X86-64-NEXT:    movabsq $-281479271743489, %r8 # imm = 0xFFFEFFFEFFFEFFFF
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -833,8 +833,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
 ; WIN64-NEXT:    movabsq $-281479271743489, %r10 # imm = 0xFFFEFFFEFFFEFFFF
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -862,8 +862,8 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -887,8 +887,8 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -916,8 +916,8 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
+; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
-; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -941,8 +941,8 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
+; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
-; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:

diff --git a/llvm/test/CodeGen/X86/fold-add.ll b/llvm/test/CodeGen/X86/fold-add.ll
index c6c787a4b2d28..597e51d877eb4 100644
--- a/llvm/test/CodeGen/X86/fold-add.ll
+++ b/llvm/test/CodeGen/X86/fold-add.ll
@@ -171,10 +171,10 @@ define dso_local i64 @neg_0x80000001() #0 {
 ;
 ; MPIC-LABEL: neg_0x80000001:
 ; MPIC:       # %bb.0: # %entry
-; MPIC-NEXT:    leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx
-; MPIC-NEXT:    movabsq $foo@GOTOFF, %rdx
+; MPIC-NEXT:    leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
+; MPIC-NEXT:    movabsq $foo@GOTOFF, %rcx
+; MPIC-NEXT:    addq %rax, %rcx
 ; MPIC-NEXT:    movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
-; MPIC-NEXT:    addq %rdx, %rax
 ; MPIC-NEXT:    addq %rcx, %rax
 ; MPIC-NEXT:    retq
 entry:

diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index 89dd4832d6b43..d2c00df281610 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -149,17 +149,17 @@ define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
 ; NOBMI-LABEL: not_a_masked_merge2:
 ; NOBMI:       # %bb.0:
 ; NOBMI-NEXT:    movl %edi, %eax
+; NOBMI-NEXT:    orl %edi, %esi
 ; NOBMI-NEXT:    notl %eax
 ; NOBMI-NEXT:    andl %edx, %eax
 ; NOBMI-NEXT:    orl %esi, %eax
-; NOBMI-NEXT:    orl %edi, %eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: not_a_masked_merge2:
 ; BMI:       # %bb.0:
+; BMI-NEXT:    orl %edi, %esi
 ; BMI-NEXT:    andnl %edx, %edi, %eax
 ; BMI-NEXT:    orl %esi, %eax
-; BMI-NEXT:    orl %edi, %eax
 ; BMI-NEXT:    retq
   %not_an_and0 = or i32 %a0, %a1
   %not = xor i32 %a0, -1

diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index a08133bb7af48..a8636a3496dc4 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -24,85 +24,87 @@ define i64 @fn1() #0 {
 ; CHECK-NEXT:    .cfi_offset %esi, -20
 ; CHECK-NEXT:    .cfi_offset %edi, -16
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
-; CHECK-NEXT:    movl $-1028477379, %edi # imm = 0xC2B2AE3D
-; CHECK-NEXT:    movl $668265295, %ebx # imm = 0x27D4EB4F
-; CHECK-NEXT:    movl a, %eax
-; CHECK-NEXT:    cmpl $0, (%eax)
+; CHECK-NEXT:    movl $-1028477379, %ebx # imm = 0xC2B2AE3D
+; CHECK-NEXT:    movl $668265295, %ecx # imm = 0x27D4EB4F
+; CHECK-NEXT:    movl a, %edi
+; CHECK-NEXT:    cmpl $0, (%edi)
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    movl 8(%eax), %edi
-; CHECK-NEXT:    movl 12(%eax), %esi
-; CHECK-NEXT:    movl %esi, %edx
-; CHECK-NEXT:    shldl $1, %edi, %edx
-; CHECK-NEXT:    orl %esi, %edx
-; CHECK-NEXT:    leal (%edi,%edi), %ecx
-; CHECK-NEXT:    orl %edi, %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 16(%eax), %ecx
-; CHECK-NEXT:    movl 20(%eax), %esi
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    shldl $2, %ecx, %edi
-; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %esi, %edi
-; CHECK-NEXT:    shldl $31, %ecx, %edi
-; CHECK-NEXT:    shll $2, %ecx
-; CHECK-NEXT:    orl %edi, %ecx
+; CHECK-NEXT:    movl 8(%edi), %esi
+; CHECK-NEXT:    movl 12(%edi), %eax
+; CHECK-NEXT:    movl %eax, %edx
+; CHECK-NEXT:    shldl $1, %esi, %edx
+; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    leal (%esi,%esi), %eax
+; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 16(%edi), %ebx
+; CHECK-NEXT:    movl 20(%edi), %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    shldl $2, %ebx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    shldl $31, %eax, %ebx
+; CHECK-NEXT:    shll $2, %eax
+; CHECK-NEXT:    orl %ebx, %eax
 ; CHECK-NEXT:    shrl %esi
 ; CHECK-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    adcl %edx, %esi
-; CHECK-NEXT:    movl 28(%eax), %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 24(%eax), %eax
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 24(%edi), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl $-1028477379, %ecx # imm = 0xC2B2AE3D
-; CHECK-NEXT:    imull %eax, %ecx
-; CHECK-NEXT:    mull %ebx
-; CHECK-NEXT:    movl %eax, %edi
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl $-1028477379, %ebx # imm = 0xC2B2AE3D
 ; CHECK-NEXT:    imull %eax, %ebx
-; CHECK-NEXT:    addl %ecx, %ebx
-; CHECK-NEXT:    addl %edx, %ebx
-; CHECK-NEXT:    imull $1336530590, %eax, %ecx # imm = 0x4FA9D69E
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-NEXT:    imull $-2056954758, %edx, %eax # imm = 0x85655C7A
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    addl %ebx, %edx
+; CHECK-NEXT:    movl 28(%edi), %edi
+; CHECK-NEXT:    imull %edi, %ecx
+; CHECK-NEXT:    addl %edx, %ecx
 ; CHECK-NEXT:    movl $1336530590, %edx # imm = 0x4FA9D69E
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    mull %edx
-; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    shrdl $3, %ebx, %edi
-; CHECK-NEXT:    sarl $3, %ebx
-; CHECK-NEXT:    orl %ecx, %ebx
-; CHECK-NEXT:    orl %eax, %edi
-; CHECK-NEXT:    imull $326129324, %edi, %eax # imm = 0x137056AC
-; CHECK-NEXT:    imull $-66860409, %ebx, %ecx # imm = 0xFC03CA87
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A
+; CHECK-NEXT:    addl %edx, %ebx
+; CHECK-NEXT:    imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
+; CHECK-NEXT:    addl %ebx, %edx
+; CHECK-NEXT:    shrdl $3, %ecx, %esi
+; CHECK-NEXT:    sarl $3, %ecx
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    orl %eax, %esi
 ; CHECK-NEXT:    movl $-66860409, %ebx # imm = 0xFC03CA87
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    mull %ebx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    imull $326129324, %esi, %eax # imm = 0x137056AC
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-NEXT:    movl %edi, b
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    mull %ebx
-; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    xorl %esi, %ecx
-; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT:    imull $326129324, %edi, %esi # imm = 0x137056AC
+; CHECK-NEXT:    addl %edx, %esi
 ; CHECK-NEXT:    movl %ecx, b+4
-; CHECK-NEXT:    imull $326129324, %eax, %edx # imm = 0x137056AC
 ; CHECK-NEXT:    imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
-; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    movl %eax, b
-; CHECK-NEXT:    mull %ebx
 ; CHECK-NEXT:    jmp .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: # %if.else
-; CHECK-NEXT:    xorl b+4, %edi
-; CHECK-NEXT:    xorl b, %ebx
-; CHECK-NEXT:    movl $1419758215, %ecx # imm = 0x549FCA87
-; CHECK-NEXT:    movl %ebx, %eax
-; CHECK-NEXT:    mull %ecx
-; CHECK-NEXT:    imull $93298681, %ebx, %esi # imm = 0x58F9FF9
-; CHECK-NEXT:    imull $1419758215, %edi, %ecx # imm = 0x549FCA87
-; CHECK-NEXT:    addl %esi, %ecx
+; CHECK-NEXT:    xorl b+4, %ebx
+; CHECK-NEXT:    xorl b, %ecx
+; CHECK-NEXT:    movl $1419758215, %edx # imm = 0x549FCA87
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    mull %edx
+; CHECK-NEXT:    imull $93298681, %ecx, %esi # imm = 0x58F9FF9
+; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    imull $1419758215, %ebx, %ecx # imm = 0x549FCA87
 ; CHECK-NEXT:  .LBB0_3: # %if.end
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %esi, %ecx
 ; CHECK-NEXT:    addl $-1028477341, %eax # imm = 0xC2B2AE63
 ; CHECK-NEXT:    adcl $-2048144777, %ecx # imm = 0x85EBCA77
 ; CHECK-NEXT:    movl %eax, b

diff --git a/llvm/test/CodeGen/X86/h-registers-1.ll b/llvm/test/CodeGen/X86/h-registers-1.ll
index 7ed6ddf267a8e..07d85d260a37a 100644
--- a/llvm/test/CodeGen/X86/h-registers-1.ll
+++ b/llvm/test/CodeGen/X86/h-registers-1.ll
@@ -30,11 +30,11 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
 ; CHECK-NEXT:    addq %rdi, %rsi
 ; CHECK-NEXT:    addq %rbp, %rdx
+; CHECK-NEXT:    addq %rsi, %rdx
 ; CHECK-NEXT:    addq %rbx, %rcx
 ; CHECK-NEXT:    addq %r8, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    addq %rdx, %rax
-; CHECK-NEXT:    addq %rsi, %rax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    popq %rbp
@@ -63,11 +63,11 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
 ; GNUX32-NEXT:    movzbl {{[0-9]+}}(%esp), %r8d
 ; GNUX32-NEXT:    addq %rdi, %rsi
 ; GNUX32-NEXT:    addq %rbp, %rdx
+; GNUX32-NEXT:    addq %rsi, %rdx
 ; GNUX32-NEXT:    addq %rbx, %rcx
 ; GNUX32-NEXT:    addq %r8, %rax
 ; GNUX32-NEXT:    addq %rcx, %rax
 ; GNUX32-NEXT:    addq %rdx, %rax
-; GNUX32-NEXT:    addq %rsi, %rax
 ; GNUX32-NEXT:    popq %rbx
 ; GNUX32-NEXT:    .cfi_def_cfa_offset 16
 ; GNUX32-NEXT:    popq %rbp

diff --git a/llvm/test/CodeGen/X86/hipe-cc.ll b/llvm/test/CodeGen/X86/hipe-cc.ll
index d8a78eabaf660..a7bcd17b23275 100644
--- a/llvm/test/CodeGen/X86/hipe-cc.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc.ll
@@ -36,7 +36,7 @@ entry:
 define cc 11 {i32, i32, i32} @addfour(i32 %hp, i32 %p, i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-LABEL: addfour:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    retl
 entry:

diff --git a/llvm/test/CodeGen/X86/hipe-cc64.ll b/llvm/test/CodeGen/X86/hipe-cc64.ll
index ed75015d3995a..d8505641cd789 100644
--- a/llvm/test/CodeGen/X86/hipe-cc64.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc64.ll
@@ -40,9 +40,9 @@ entry:
 define cc 11 {i64, i64, i64} @addfour(i64 %hp, i64 %p, i64 %x, i64 %y, i64 %z, i64 %w) nounwind {
 ; CHECK-LABEL: addfour:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    leaq (%rdx,%rcx), %rax
-; CHECK-NEXT:    addq %r8, %rax
-; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    addq %rsi, %rdx
+; CHECK-NEXT:    leaq (%rcx,%r8), %rax
+; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = add i64 %x, %y

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
index 49eebce001776..873083a0703da 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll
@@ -320,36 +320,36 @@ define i32 @PR37890_v16i32(<16 x i32> %a)  {
 ; SSE2-LABEL: PR37890_v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-SLOW-LABEL: PR37890_v16i32:
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
-; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm0
 ; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    movd %xmm0, %eax
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: PR37890_v16i32:
 ; SSSE3-FAST:       # %bb.0:
 ; SSSE3-FAST-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm0
 ; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT:    movd %xmm0, %eax
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSSE3-FAST-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
+; SSSE3-FAST-NEXT:    movd %xmm1, %eax
 ; SSSE3-FAST-NEXT:    retq
 ;
 ; AVX1-SLOW-LABEL: PR37890_v16i32:
@@ -357,8 +357,8 @@ define i32 @PR37890_v16i32(<16 x i32> %a)  {
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
index 3e97c22e48e3d..16f3ef535b0bb 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll
@@ -196,7 +196,7 @@ define double @PR37890_v8f64(<8 x double> %a)  {
 ; SSE2-LABEL: PR37890_v8f64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm1
+; SSE2-NEXT:    addpd %xmm2, %xmm0
 ; SSE2-NEXT:    addpd %xmm1, %xmm0
 ; SSE2-NEXT:    movapd %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
@@ -206,7 +206,7 @@ define double @PR37890_v8f64(<8 x double> %a)  {
 ; SSSE3-SLOW-LABEL: PR37890_v8f64:
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    addpd %xmm3, %xmm1
-; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    addpd %xmm2, %xmm0
 ; SSSE3-SLOW-NEXT:    addpd %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
@@ -216,7 +216,7 @@ define double @PR37890_v8f64(<8 x double> %a)  {
 ; SSSE3-FAST-LABEL: PR37890_v8f64:
 ; SSSE3-FAST:       # %bb.0:
 ; SSSE3-FAST-NEXT:    addpd %xmm3, %xmm1
-; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    addpd %xmm2, %xmm0
 ; SSSE3-FAST-NEXT:    addpd %xmm1, %xmm0
 ; SSSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
 ; SSSE3-FAST-NEXT:    retq
@@ -265,7 +265,7 @@ define float @PR37890_v16f32(<16 x float> %a)  {
 ; SSE2-LABEL: PR37890_v16f32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    addps %xmm3, %xmm1
-; SSE2-NEXT:    addps %xmm2, %xmm1
+; SSE2-NEXT:    addps %xmm2, %xmm0
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
 ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
@@ -278,7 +278,7 @@ define float @PR37890_v16f32(<16 x float> %a)  {
 ; SSSE3-SLOW-LABEL: PR37890_v16f32:
 ; SSSE3-SLOW:       # %bb.0:
 ; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
-; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm0
 ; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
@@ -290,7 +290,7 @@ define float @PR37890_v16f32(<16 x float> %a)  {
 ; SSSE3-FAST-LABEL: PR37890_v16f32:
 ; SSSE3-FAST:       # %bb.0:
 ; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
-; SSSE3-FAST-NEXT:    addps %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    addps %xmm2, %xmm0
 ; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
 ; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm1
 ; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index b85972e86b05f..706c0b97fe706 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -1321,13 +1321,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v16i32:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxsd %xmm3, %xmm1
-; X86-SSE42-NEXT:    pmaxsd %xmm2, %xmm1
-; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT:    pmaxsd %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v16i32:
@@ -1335,8 +1335,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpmaxsd %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1393,13 +1393,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v16i32:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxsd %xmm3, %xmm1
-; X64-SSE42-NEXT:    pmaxsd %xmm2, %xmm1
-; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT:    pmaxsd %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v16i32:
@@ -1407,8 +1407,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpmaxsd %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1463,26 +1463,26 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm1
-; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v32i16:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxsw %xmm3, %xmm1
-; X86-SSE42-NEXT:    pmaxsw %xmm2, %xmm1
-; X86-SSE42-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pmaxsw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    xorl $32767, %eax ## imm = 0x7FFF
 ; X86-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1493,8 +1493,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1519,26 +1519,26 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i16:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm1
-; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: test_reduce_v32i16:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxsw %xmm3, %xmm1
-; X64-SSE42-NEXT:    pmaxsw %xmm2, %xmm1
-; X64-SSE42-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pmaxsw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    xorl $32767, %eax ## imm = 0x7FFF
 ; X64-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1549,8 +1549,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1655,13 +1655,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v64i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm2, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrlw $8, %xmm1
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    xorb $127, %al
 ; X86-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
@@ -1672,8 +1672,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
@@ -1749,13 +1749,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v64i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
-; X64-SSE42-NEXT:    pmaxsb %xmm2, %xmm1
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrlw $8, %xmm1
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    xorb $127, %al
 ; X64-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
@@ -1766,8 +1766,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index f90b951026525..d4c842402c073 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -1325,13 +1325,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v16i32:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminsd %xmm3, %xmm1
-; X86-SSE42-NEXT:    pminsd %xmm2, %xmm1
-; X86-SSE42-NEXT:    pminsd %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT:    pminsd %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pminsd %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT:    pminsd %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v16i32:
@@ -1339,8 +1339,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpminsd %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1397,13 +1397,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v16i32:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminsd %xmm3, %xmm1
-; X64-SSE42-NEXT:    pminsd %xmm2, %xmm1
-; X64-SSE42-NEXT:    pminsd %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT:    pminsd %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pminsd %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT:    pminsd %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v16i32:
@@ -1411,8 +1411,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpminsd %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1467,26 +1467,26 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    pminsw %xmm3, %xmm1
-; X86-SSE2-NEXT:    pminsw %xmm2, %xmm1
-; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v32i16:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminsw %xmm3, %xmm1
-; X86-SSE42-NEXT:    pminsw %xmm2, %xmm1
-; X86-SSE42-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pminsw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    xorl $32768, %eax ## imm = 0x8000
 ; X86-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1497,8 +1497,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpminsw %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1523,26 +1523,26 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i16:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    pminsw %xmm3, %xmm1
-; X64-SSE2-NEXT:    pminsw %xmm2, %xmm1
-; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: test_reduce_v32i16:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminsw %xmm3, %xmm1
-; X64-SSE42-NEXT:    pminsw %xmm2, %xmm1
-; X64-SSE42-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pminsw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    xorl $32768, %eax ## imm = 0x8000
 ; X64-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1553,8 +1553,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpminsw %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1659,13 +1659,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v64i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminsb %xmm3, %xmm1
-; X86-SSE42-NEXT:    pminsb %xmm2, %xmm1
-; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrlw $8, %xmm1
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    addb $-128, %al
 ; X86-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
@@ -1676,8 +1676,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
@@ -1753,13 +1753,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v64i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminsb %xmm3, %xmm1
-; X64-SSE42-NEXT:    pminsb %xmm2, %xmm1
-; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrlw $8, %xmm1
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    addb $-128, %al
 ; X64-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
@@ -1770,8 +1770,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index cfedacabc12d0..28abb22f917ed 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -1486,13 +1486,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v16i32:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxud %xmm3, %xmm1
-; X86-SSE42-NEXT:    pmaxud %xmm2, %xmm1
-; X86-SSE42-NEXT:    pmaxud %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT:    pmaxud %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pmaxud %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT:    pmaxud %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v16i32:
@@ -1500,8 +1500,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpmaxud %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1573,13 +1573,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v16i32:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxud %xmm3, %xmm1
-; X64-SSE42-NEXT:    pmaxud %xmm2, %xmm1
-; X64-SSE42-NEXT:    pmaxud %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT:    pmaxud %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pmaxud %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT:    pmaxud %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v16i32:
@@ -1587,8 +1587,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpmaxud %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1665,11 +1665,11 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v32i16:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxuw %xmm3, %xmm1
-; X86-SSE42-NEXT:    pmaxuw %xmm2, %xmm1
-; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    pmaxuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    notl %eax
 ; X86-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1680,8 +1680,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
@@ -1730,11 +1730,11 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v32i16:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxuw %xmm3, %xmm1
-; X64-SSE42-NEXT:    pmaxuw %xmm2, %xmm1
-; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
-; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    pmaxuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm0, %xmm1
+; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    notl %eax
 ; X64-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1745,8 +1745,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
@@ -1806,33 +1806,33 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v64i8:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    pmaxub %xmm3, %xmm1
-; X86-SSE2-NEXT:    pmaxub %xmm2, %xmm1
-; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pmaxub %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    psrlw $8, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
 ; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v64i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxub %xmm3, %xmm1
-; X86-SSE42-NEXT:    pmaxub %xmm2, %xmm1
-; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
-; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrlw $8, %xmm1
-; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pmaxub %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    notb %al
 ; X86-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
@@ -1843,8 +1843,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
@@ -1875,33 +1875,33 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v64i8:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    pmaxub %xmm3, %xmm1
-; X64-SSE2-NEXT:    pmaxub %xmm2, %xmm1
-; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT:    pmaxub %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE2-NEXT:    psrlw $8, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
 ; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
 ; X64-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: test_reduce_v64i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxub %xmm3, %xmm1
-; X64-SSE42-NEXT:    pmaxub %xmm2, %xmm1
-; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
-; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrlw $8, %xmm1
-; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pmaxub %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    notb %al
 ; X64-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
@@ -1912,8 +1912,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1

diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index c8c6bbf20c834..99492a3c14449 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -1404,13 +1404,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v16i32:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminud %xmm3, %xmm1
-; X86-SSE42-NEXT:    pminud %xmm2, %xmm1
-; X86-SSE42-NEXT:    pminud %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE42-NEXT:    pminud %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pminud %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE42-NEXT:    pminud %xmm0, %xmm1
-; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X86-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: test_reduce_v16i32:
@@ -1418,8 +1418,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpminud %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1491,13 +1491,13 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v16i32:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminud %xmm3, %xmm1
-; X64-SSE42-NEXT:    pminud %xmm2, %xmm1
-; X64-SSE42-NEXT:    pminud %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE42-NEXT:    pminud %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pminud %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE42-NEXT:    pminud %xmm0, %xmm1
-; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: test_reduce_v16i32:
@@ -1505,8 +1505,8 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpminud %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1589,9 +1589,9 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v32i16:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminuw %xmm3, %xmm1
-; X86-SSE42-NEXT:    pminuw %xmm2, %xmm1
-; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-SSE42-NEXT:    retl
@@ -1601,8 +1601,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpminuw %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1651,9 +1651,9 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v32i16:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminuw %xmm3, %xmm1
-; X64-SSE42-NEXT:    pminuw %xmm2, %xmm1
-; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
-; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-SSE42-NEXT:    retq
@@ -1663,8 +1663,8 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpminuw %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def $ax killed $ax killed $eax
@@ -1716,31 +1716,31 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v64i8:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    pminub %xmm3, %xmm1
-; X86-SSE2-NEXT:    pminub %xmm2, %xmm1
-; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X86-SSE2-NEXT:    pminub %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE2-NEXT:    psrlw $8, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
 ; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
 ; X86-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-SSE42-LABEL: test_reduce_v64i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminub %xmm3, %xmm1
-; X86-SSE42-NEXT:    pminub %xmm2, %xmm1
-; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminub %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X86-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrlw $8, %xmm1
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT:    movd %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-SSE42-NEXT:    retl
@@ -1750,8 +1750,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X86-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
-; X86-AVX1-NEXT:    vpminub %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
@@ -1776,31 +1776,31 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v64i8:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    pminub %xmm3, %xmm1
-; X64-SSE2-NEXT:    pminub %xmm2, %xmm1
-; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; X64-SSE2-NEXT:    pminub %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE2-NEXT:    psrlw $8, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
 ; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
 ; X64-SSE2-NEXT:    ## kill: def $al killed $al killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
 ; X64-SSE42-LABEL: test_reduce_v64i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminub %xmm3, %xmm1
-; X64-SSE42-NEXT:    pminub %xmm2, %xmm1
-; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminub %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X64-SSE42-NEXT:    phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrlw $8, %xmm1
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT:    movd %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def $al killed $al killed $eax
 ; X64-SSE42-NEXT:    retq
@@ -1810,8 +1810,8 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; X64-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
-; X64-AVX1-NEXT:    vpminub %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 542adaedd5600..8dab14a440b24 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -656,11 +656,11 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm4
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
-; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm5
 ; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm5
+; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm5
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
-; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm6
 ; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm6
 ; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
 ; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
@@ -703,15 +703,15 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
 ; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; AVX1-SLOW-NEXT:    retq
 ;
@@ -727,14 +727,14 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
 ; AVX1-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; AVX1-FAST-NEXT:    retq
 ;
@@ -758,9 +758,9 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
 ; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
@@ -781,9 +781,9 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
 ; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm4
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
 ; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    retq
   %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
   %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>

diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index 582fae8cf63c4..38815d950a0e6 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -28,7 +28,8 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind {
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    orl %ecx, %ebx
 ; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    orl %edx, %ebp
+; X86-NEXT:    orl %ecx, %ebp
 ; X86-NEXT:    shrdl $28, %ebx, %ebp
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %exit

diff --git a/llvm/test/CodeGen/X86/imul.ll b/llvm/test/CodeGen/X86/imul.ll
index 94d786a65ffa8..9131688c4efcc 100644
--- a/llvm/test/CodeGen/X86/imul.ll
+++ b/llvm/test/CodeGen/X86/imul.ll
@@ -450,18 +450,13 @@ define i64 @test6(i64 %a) {
 ;
 ; X86-LABEL: test6:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    shll $5, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $5, %ecx
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl $33, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 entry:
 	%tmp3 = mul i64 %a, 33

diff --git a/llvm/test/CodeGen/X86/lea-opt-cse4.ll b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
index d63fdcbca79ed..ef8e030d6caf2 100644
--- a/llvm/test/CodeGen/X86/lea-opt-cse4.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
@@ -73,11 +73,9 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
 ; X64-NEXT:    leal 1(%rax,%rcx), %ecx
 ; X64-NEXT:    leal (%rax,%rax), %edx
 ; X64-NEXT:    addl %eax, %edx
-; X64-NEXT:    addl %eax, %edx
-; X64-NEXT:    addl %eax, %edx
-; X64-NEXT:    addl %eax, %edx
-; X64-NEXT:    addl %ecx, %edx
-; X64-NEXT:    movl %edx, 16(%rdi)
+; X64-NEXT:    addl %edx, %ecx
+; X64-NEXT:    addl %edx, %ecx
+; X64-NEXT:    movl %ecx, 16(%rdi)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: foo_loop:
@@ -104,11 +102,9 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
 ; X86-NEXT:    leal 1(%ecx,%esi), %edx
 ; X86-NEXT:    leal (%ecx,%ecx), %esi
 ; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %edx, 16(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %edi

diff --git a/llvm/test/CodeGen/X86/lea-opt2.ll b/llvm/test/CodeGen/X86/lea-opt2.ll
index b076188c4e918..cec19dcf49c8d 100644
--- a/llvm/test/CodeGen/X86/lea-opt2.ll
+++ b/llvm/test/CodeGen/X86/lea-opt2.ll
@@ -35,11 +35,13 @@ entry:
 define i32 @test2(ptr %p, i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    addl %esi, %ecx
 ; CHECK-NEXT:    movl %ecx, (%rdi)
 ; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = add i32 %a, %b
@@ -53,11 +55,13 @@ entry:
 define i32 @test3(ptr %p, i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    addl %edx, %ecx
-; CHECK-NEXT:    addl %esi, %ecx
 ; CHECK-NEXT:    movl %ecx, (%rdi)
 ; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = add i32 %a, %b
@@ -110,8 +114,8 @@ define i64 @test6(ptr %p, i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    addq %rdx, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movq %rcx, (%rdi)
 ; CHECK-NEXT:    subq %rdx, %rax
 ; CHECK-NEXT:    retq
@@ -128,8 +132,8 @@ define i64 @test7(ptr %p, i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    addq %rdx, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movq %rcx, (%rdi)
 ; CHECK-NEXT:    subq %rdx, %rax
 ; CHECK-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/logic-shift.ll b/llvm/test/CodeGen/X86/logic-shift.ll
index 829ed4f748dd3..96e63d1122ec9 100644
--- a/llvm/test/CodeGen/X86/logic-shift.ll
+++ b/llvm/test/CodeGen/X86/logic-shift.ll
@@ -234,8 +234,8 @@ define i64 @or_mix_shr(i64 %x0, i64 %x1, i64 %y, i64 %z) {
 ; CHECK-NEXT:    sarq %cl, %rdi
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, %rsi
-; CHECK-NEXT:    orq %rsi, %rax
 ; CHECK-NEXT:    orq %rdi, %rax
+; CHECK-NEXT:    orq %rsi, %rax
 ; CHECK-NEXT:    retq
   %sh1 = ashr i64 %x0, %y
   %sh2 = lshr i64 %x1, %y
@@ -255,8 +255,8 @@ define i64 @or_lshr_mix_shift_amount(i64 %x0, i64 %x1, i64 %y, i64 %z, i64 %w) {
 ; CHECK-NEXT:    shrq %cl, %rdi
 ; CHECK-NEXT:    movl %r8d, %ecx
 ; CHECK-NEXT:    shrq %cl, %rsi
-; CHECK-NEXT:    orq %rsi, %rax
 ; CHECK-NEXT:    orq %rdi, %rax
+; CHECK-NEXT:    orq %rsi, %rax
 ; CHECK-NEXT:    retq
   %sh1 = lshr i64 %x0, %y
   %sh2 = lshr i64 %x1, %w
@@ -518,8 +518,8 @@ define i64 @xor_mix_shr(i64 %x0, i64 %x1, i64 %y, i64 %z) {
 ; CHECK-NEXT:    sarq %cl, %rdi
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    shrq %cl, %rsi
-; CHECK-NEXT:    xorq %rsi, %rax
 ; CHECK-NEXT:    xorq %rdi, %rax
+; CHECK-NEXT:    xorq %rsi, %rax
 ; CHECK-NEXT:    retq
   %sh1 = ashr i64 %x0, %y
   %sh2 = lshr i64 %x1, %y
@@ -539,8 +539,8 @@ define i64 @xor_lshr_mix_shift_amount(i64 %x0, i64 %x1, i64 %y, i64 %z, i64 %w)
 ; CHECK-NEXT:    shrq %cl, %rdi
 ; CHECK-NEXT:    movl %r8d, %ecx
 ; CHECK-NEXT:    shrq %cl, %rsi
-; CHECK-NEXT:    xorq %rsi, %rax
 ; CHECK-NEXT:    xorq %rdi, %rax
+; CHECK-NEXT:    xorq %rsi, %rax
 ; CHECK-NEXT:    retq
   %sh1 = lshr i64 %x0, %y
   %sh2 = lshr i64 %x1, %w
@@ -802,8 +802,8 @@ define i64 @and_mix_shr(i64 %x0, i64 %x1, i64 %y, i64 %z) {
 ; CHECK-NEXT:    shrq %cl, %rdi
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NEXT:    sarq %cl, %rsi
-; CHECK-NEXT:    andq %rsi, %rax
 ; CHECK-NEXT:    andq %rdi, %rax
+; CHECK-NEXT:    andq %rsi, %rax
 ; CHECK-NEXT:    retq
   %sh1 = lshr i64 %x0, %y
   %sh2 = ashr i64 %x1, %y
@@ -823,8 +823,8 @@ define i64 @and_lshr_mix_shift_amount(i64 %x0, i64 %x1, i64 %y, i64 %z, i64 %w)
 ; CHECK-NEXT:    shrq %cl, %rdi
 ; CHECK-NEXT:    movl %r8d, %ecx
 ; CHECK-NEXT:    shrq %cl, %rsi
-; CHECK-NEXT:    andq %rsi, %rax
 ; CHECK-NEXT:    andq %rdi, %rax
+; CHECK-NEXT:    andq %rsi, %rax
 ; CHECK-NEXT:    retq
   %sh1 = lshr i64 %x0, %y
   %sh2 = lshr i64 %x1, %w

diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll
index 6d7394180253b..bed0e3a24a017 100644
--- a/llvm/test/CodeGen/X86/machine-cp.ll
+++ b/llvm/test/CodeGen/X86/machine-cp.ll
@@ -99,62 +99,56 @@ while.end:                                        ; preds = %while.body, %entry
 define <16 x float> @foo(<16 x float> %x) {
 ; CHECK-LABEL: foo:
 ; CHECK:       ## %bb.0: ## %bb
-; CHECK-NEXT:    movaps %xmm3, %xmm9
-; CHECK-NEXT:    movaps %xmm2, %xmm5
+; CHECK-NEXT:    xorps %xmm5, %xmm5
+; CHECK-NEXT:    cvttps2dq %xmm3, %xmm7
+; CHECK-NEXT:    movaps %xmm3, %xmm4
+; CHECK-NEXT:    cmpltps %xmm5, %xmm4
+; CHECK-NEXT:    movaps {{.*#+}} xmm8 = [13,14,15,16]
+; CHECK-NEXT:    movaps %xmm4, %xmm6
+; CHECK-NEXT:    orps %xmm8, %xmm6
+; CHECK-NEXT:    cvtdq2ps %xmm7, %xmm3
+; CHECK-NEXT:    andps %xmm8, %xmm3
+; CHECK-NEXT:    andps %xmm6, %xmm3
+; CHECK-NEXT:    andnps %xmm4, %xmm6
+; CHECK-NEXT:    cvttps2dq %xmm2, %xmm4
+; CHECK-NEXT:    movaps %xmm2, %xmm7
+; CHECK-NEXT:    cmpltps %xmm5, %xmm7
+; CHECK-NEXT:    movaps {{.*#+}} xmm8 = [9,10,11,12]
+; CHECK-NEXT:    movaps %xmm7, %xmm9
+; CHECK-NEXT:    orps %xmm8, %xmm9
+; CHECK-NEXT:    cvtdq2ps %xmm4, %xmm2
+; CHECK-NEXT:    andps %xmm8, %xmm2
+; CHECK-NEXT:    andps %xmm9, %xmm2
+; CHECK-NEXT:    andnps %xmm7, %xmm9
+; CHECK-NEXT:    cvttps2dq %xmm1, %xmm4
+; CHECK-NEXT:    cmpltps %xmm5, %xmm1
+; CHECK-NEXT:    movaps {{.*#+}} xmm7 = [5,6,7,8]
+; CHECK-NEXT:    movaps %xmm1, %xmm8
+; CHECK-NEXT:    orps %xmm7, %xmm8
+; CHECK-NEXT:    cvtdq2ps %xmm4, %xmm4
+; CHECK-NEXT:    andps %xmm7, %xmm4
+; CHECK-NEXT:    andps %xmm8, %xmm4
+; CHECK-NEXT:    andnps %xmm1, %xmm8
+; CHECK-NEXT:    cvttps2dq %xmm0, %xmm1
+; CHECK-NEXT:    cmpltps %xmm5, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm5 = [1,2,3,4]
 ; CHECK-NEXT:    movaps %xmm0, %xmm7
-; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    movaps %xmm3, %xmm2
-; CHECK-NEXT:    cmpltps %xmm0, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm4
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; CHECK-NEXT:    movaps %xmm4, %xmm8
-; CHECK-NEXT:    andnps %xmm2, %xmm8
-; CHECK-NEXT:    movaps %xmm5, %xmm6
-; CHECK-NEXT:    cmpltps %xmm0, %xmm6
-; CHECK-NEXT:    movaps {{.*#+}} xmm11 = [9,10,11,12]
-; CHECK-NEXT:    movaps %xmm6, %xmm2
-; CHECK-NEXT:    orps %xmm11, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm10
-; CHECK-NEXT:    andnps %xmm6, %xmm10
-; CHECK-NEXT:    cvttps2dq %xmm1, %xmm12
-; CHECK-NEXT:    cmpltps %xmm0, %xmm1
-; CHECK-NEXT:    movaps {{.*#+}} xmm13 = [5,6,7,8]
-; CHECK-NEXT:    movaps %xmm1, %xmm6
-; CHECK-NEXT:    orps %xmm13, %xmm6
-; CHECK-NEXT:    movaps %xmm6, %xmm14
-; CHECK-NEXT:    andnps %xmm1, %xmm14
-; CHECK-NEXT:    cvttps2dq %xmm7, %xmm3
-; CHECK-NEXT:    cmpltps %xmm0, %xmm7
-; CHECK-NEXT:    movaps {{.*#+}} xmm15 = [1,2,3,4]
-; CHECK-NEXT:    movaps %xmm7, %xmm0
-; CHECK-NEXT:    orps %xmm15, %xmm0
-; CHECK-NEXT:    movaps %xmm0, %xmm1
-; CHECK-NEXT:    andnps %xmm7, %xmm1
-; CHECK-NEXT:    andps %xmm15, %xmm0
-; CHECK-NEXT:    cvtdq2ps %xmm3, %xmm3
-; CHECK-NEXT:    andps %xmm3, %xmm0
-; CHECK-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
-; CHECK-NEXT:    andps %xmm3, %xmm1
-; CHECK-NEXT:    orps %xmm1, %xmm0
-; CHECK-NEXT:    andps %xmm13, %xmm6
-; CHECK-NEXT:    cvtdq2ps %xmm12, %xmm1
-; CHECK-NEXT:    andps %xmm1, %xmm6
-; CHECK-NEXT:    andps %xmm3, %xmm14
-; CHECK-NEXT:    orps %xmm14, %xmm6
-; CHECK-NEXT:    andps %xmm11, %xmm2
-; CHECK-NEXT:    cvttps2dq %xmm5, %xmm1
+; CHECK-NEXT:    orps %xmm5, %xmm7
 ; CHECK-NEXT:    cvtdq2ps %xmm1, %xmm1
-; CHECK-NEXT:    andps %xmm1, %xmm2
-; CHECK-NEXT:    andps %xmm3, %xmm10
-; CHECK-NEXT:    orps %xmm10, %xmm2
-; CHECK-NEXT:    andps %xmm3, %xmm8
-; CHECK-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; CHECK-NEXT:    cvttps2dq %xmm9, %xmm1
-; CHECK-NEXT:    cvtdq2ps %xmm1, %xmm1
-; CHECK-NEXT:    andps %xmm1, %xmm4
+; CHECK-NEXT:    andps %xmm5, %xmm1
+; CHECK-NEXT:    andps %xmm7, %xmm1
+; CHECK-NEXT:    andnps %xmm0, %xmm7
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-NEXT:    andps %xmm0, %xmm7
+; CHECK-NEXT:    orps %xmm7, %xmm1
+; CHECK-NEXT:    andps %xmm0, %xmm8
 ; CHECK-NEXT:    orps %xmm8, %xmm4
-; CHECK-NEXT:    movaps %xmm6, %xmm1
-; CHECK-NEXT:    movaps %xmm4, %xmm3
+; CHECK-NEXT:    andps %xmm0, %xmm9
+; CHECK-NEXT:    orps %xmm9, %xmm2
+; CHECK-NEXT:    andps %xmm0, %xmm6
+; CHECK-NEXT:    orps %xmm6, %xmm3
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    movaps %xmm4, %xmm1
 ; CHECK-NEXT:    retq
 bb:
   %v3 = icmp slt <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, zeroinitializer

diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index fd568a4969cd5..4d0045fec2649 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -249,8 +249,8 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -384,8 +384,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -398,8 +398,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    movl %edx, %eax
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    xorl %ecx, %ecx
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB3_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -411,28 +411,28 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
 ; AVX1-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
 ; AVX1-NEXT:    vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
 ; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB3_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -831,8 +831,8 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -980,8 +980,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -994,8 +994,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT:    movl %edx, %eax
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    xorl %ecx, %ecx
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB7_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1011,28 +1011,28 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT:    vpmaddwd %xmm5, %xmm7, %xmm5
 ; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm7
 ; AVX1-NEXT:    vpmaddwd %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
 ; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT:    addq $32, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB7_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1457,8 +1457,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1625,8 +1625,8 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    paddd %xmm4, %xmm0
 ; SSE2-NEXT:    paddd %xmm5, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1639,9 +1639,9 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    movl %edx, %eax
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    xorl %ecx, %ecx
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB11_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1669,37 +1669,37 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    vpmulld %xmm10, %xmm12, %xmm10
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vpmulld %xmm11, %xmm12, %xmm11
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm12, %xmm4
-; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm7, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm8, %xmm4
-; AVX1-NEXT:    vpaddd %xmm1, %xmm9, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
-; AVX1-NEXT:    vpaddd %xmm3, %xmm11, %xmm3
+; AVX1-NEXT:    vpaddd %xmm4, %xmm8, %xmm4
+; AVX1-NEXT:    vpaddd %xmm3, %xmm9, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm4
+; AVX1-NEXT:    vpaddd %xmm2, %xmm11, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB11_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -2661,17 +2661,17 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; SSE2-NEXT:    movdqu (%rdx), %xmm0
 ; SSE2-NEXT:    movdqu (%rcx), %xmm2
 ; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    movdqu (%r8), %xmm0
-; SSE2-NEXT:    movdqu (%r9), %xmm3
-; SSE2-NEXT:    pmaddwd %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    movdqu (%r10), %xmm0
-; SSE2-NEXT:    movdqu (%rax), %xmm1
+; SSE2-NEXT:    movdqu (%r9), %xmm1
 ; SSE2-NEXT:    pmaddwd %xmm0, %xmm1
-; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movdqu (%r10), %xmm0
+; SSE2-NEXT:    movdqu (%rax), %xmm2
+; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm1, %eax
@@ -2685,13 +2685,13 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu (%rdx), %xmm1
 ; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqu (%r8), %xmm2
-; AVX-NEXT:    vpmaddwd (%r9), %xmm2, %xmm2
-; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu (%r10), %xmm2
-; AVX-NEXT:    vpmaddwd (%rax), %xmm2, %xmm2
-; AVX-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vmovdqu (%r8), %xmm1
+; AVX-NEXT:    vpmaddwd (%r9), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%r10), %xmm1
+; AVX-NEXT:    vpmaddwd (%rax), %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -3163,8 +3163,8 @@ define i32 @add_used_by_loop_phi(i8* %a, i8* %b, i64 %offset_a, i64 %offset_b, i
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]

diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 513b608e9ae3c..52d2187b50d61 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1754,8 +1754,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX512F-NEXT:    vpgatherdd c(,%zmm0), %zmm2 {%k2}
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
 ; AVX512F-NEXT:    vpgatherdd c(,%zmm0), %zmm1 {%k1}
-; AVX512F-NEXT:    vpaddd %ymm1, %ymm1, %ymm0
-; AVX512F-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpaddd %ymm1, %ymm2, %ymm0
+; AVX512F-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: gather_v8i32_v8i32:
@@ -1768,8 +1768,8 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX512VL-NEXT:    vpgatherdd c(,%ymm1), %ymm2 {%k2}
 ; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28]
 ; AVX512VL-NEXT:    vpgatherdd c(,%ymm1), %ymm0 {%k1}
-; AVX512VL-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
   %1 = icmp eq <8 x i32> %trigger, zeroinitializer
   %2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> getelementptr (%struct.a, <8 x ptr> <ptr @c, ptr @c, ptr @c, ptr @c, ptr @c, ptr @c, ptr @c, ptr @c>, <8 x i64> zeroinitializer, i32 0, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>), i32 4, <8 x i1> %1, <8 x i32> undef)

diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index bfc98143cc909..1289eef7795dc 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -499,13 +499,13 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
 ; KNL_64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
 ; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
@@ -520,8 +520,8 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT:    movw $255, %ax
 ; KNL_32-NEXT:    kmovw %eax, %k1
@@ -533,10 +533,10 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; SKX_SMALL:       # %bb.0: # %entry
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
+; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_SMALL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; SKX_SMALL-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -550,8 +550,8 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
-; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
+; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_LARGE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; SKX_LARGE-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -583,13 +583,13 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
 ; KNL_64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
+; KNL_64-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
 ; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
 ; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
@@ -604,8 +604,8 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
+; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; KNL_32-NEXT:    movw $255, %ax
 ; KNL_32-NEXT:    kmovw %eax, %k1
@@ -617,10 +617,10 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; SKX_SMALL:       # %bb.0: # %entry
 ; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
 ; SKX_SMALL-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
+; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_SMALL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; SKX_SMALL-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -634,8 +634,8 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
 ; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
-; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
+; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
 ; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_LARGE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; SKX_LARGE-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -3496,7 +3496,7 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64
 ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    vpgatherqq (,%ymm0), %ymm1 {%k1}
 ; SKX-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
-; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_pr28312:
@@ -3513,7 +3513,7 @@ define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64
 ; SKX_32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT:    vpgatherdq (,%xmm0), %ymm1 {%k1}
 ; SKX_32-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
-; SKX_32-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; SKX_32-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    .cfi_def_cfa %esp, 4

diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index f852bc897bd77..c0f8f86e6e8b1 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -1615,12 +1615,12 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    sete %al
 ; X86-SSE2-NEXT:    retl
@@ -1635,12 +1635,12 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE41-NEXT:    por %xmm2, %xmm0
 ; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE41-NEXT:    por %xmm0, %xmm2
+; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
 ; X86-SSE41-NEXT:    sete %al
 ; X86-SSE41-NEXT:    retl
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
@@ -1713,12 +1713,12 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm0, %xmm2
+; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    sete %al
 ; X86-SSE2-NEXT:    retl
@@ -1733,12 +1733,12 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
 ; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE41-NEXT:    por %xmm2, %xmm0
 ; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE41-NEXT:    por %xmm0, %xmm2
+; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
 ; X86-SSE41-NEXT:    sete %al
 ; X86-SSE41-NEXT:    retl
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
@@ -1777,8 +1777,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 ; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm0, %xmm2
 ; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
@@ -1793,8 +1793,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 ; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE41-NEXT:    por %xmm1, %xmm0
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm1, %xmm2
 ; X86-SSE41-NEXT:    por %xmm0, %xmm2
 ; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
 ; X86-SSE41-NEXT:    setne %al
@@ -1844,22 +1844,22 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm2
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm2
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm2
-; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm4
-; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm4
-; X86-SSE2-NEXT:    pand %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm1, %xmm4
-; X86-SSE2-NEXT:    pand %xmm0, %xmm4
-; X86-SSE2-NEXT:    pmovmskb %xmm4, %eax
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm3
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm0, %xmm3
+; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    setne %al
 ; X86-SSE2-NEXT:    retl
@@ -1868,22 +1868,22 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm2
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm2
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm2
-; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm4
-; X86-SSE41-NEXT:    pxor %xmm2, %xmm4
-; X86-SSE41-NEXT:    por %xmm3, %xmm4
-; X86-SSE41-NEXT:    por %xmm1, %xmm4
-; X86-SSE41-NEXT:    por %xmm0, %xmm4
-; X86-SSE41-NEXT:    ptest %xmm4, %xmm4
+; X86-SSE41-NEXT:    por %xmm2, %xmm0
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm3
+; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE41-NEXT:    por %xmm2, %xmm3
+; X86-SSE41-NEXT:    por %xmm0, %xmm3
+; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
 ; X86-SSE41-NEXT:    setne %al
 ; X86-SSE41-NEXT:    retl
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
@@ -1957,9 +1957,9 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    sete %al
@@ -1976,9 +1976,9 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE41-NEXT:    por %xmm3, %xmm2
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    por %xmm2, %xmm1
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE41-NEXT:    por %xmm1, %xmm0
+; X86-SSE41-NEXT:    por %xmm2, %xmm0
 ; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X86-SSE41-NEXT:    sete %al
 ; X86-SSE41-NEXT:    retl
@@ -2027,22 +2027,22 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm2
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm2
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm2
-; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm4
-; X86-SSE2-NEXT:    pcmpeqb %xmm2, %xmm4
-; X86-SSE2-NEXT:    pand %xmm3, %xmm4
-; X86-SSE2-NEXT:    pand %xmm1, %xmm4
-; X86-SSE2-NEXT:    pand %xmm0, %xmm4
-; X86-SSE2-NEXT:    pmovmskb %xmm4, %eax
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm3
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm0, %xmm3
+; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    setne %al
 ; X86-SSE2-NEXT:    retl
@@ -2051,22 +2051,22 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ; X86-SSE41:       # %bb.0:
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm2
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
 ; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm2
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm2, %xmm3
-; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm2
-; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm4
-; X86-SSE41-NEXT:    pxor %xmm2, %xmm4
-; X86-SSE41-NEXT:    por %xmm3, %xmm4
-; X86-SSE41-NEXT:    por %xmm1, %xmm4
-; X86-SSE41-NEXT:    por %xmm0, %xmm4
-; X86-SSE41-NEXT:    ptest %xmm4, %xmm4
+; X86-SSE41-NEXT:    por %xmm2, %xmm0
+; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
+; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm1
+; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm3
+; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE41-NEXT:    por %xmm2, %xmm3
+; X86-SSE41-NEXT:    por %xmm0, %xmm3
+; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
 ; X86-SSE41-NEXT:    setne %al
 ; X86-SSE41-NEXT:    retl
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
@@ -2140,9 +2140,9 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE2-NEXT:    pand %xmm3, %xmm2
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT:    sete %al
@@ -2159,9 +2159,9 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
 ; X86-SSE41-NEXT:    por %xmm3, %xmm2
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    por %xmm2, %xmm1
 ; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE41-NEXT:    por %xmm1, %xmm0
+; X86-SSE41-NEXT:    por %xmm2, %xmm0
 ; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X86-SSE41-NEXT:    sete %al
 ; X86-SSE41-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index a555cbabd35a9..e7caf88bed308 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -1607,10 +1607,10 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
 ; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pand %xmm0, %xmm1
-; X64-SSE2-NEXT:    pand %xmm3, %xmm1
 ; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-SSE2-NEXT:    sete %al
@@ -1625,10 +1625,10 @@ define i1 @length48_eq(ptr %x, ptr %y) nounwind {
 ; X64-SSE41-NEXT:    pxor %xmm0, %xmm3
 ; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE41-NEXT:    por %xmm3, %xmm0
 ; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE41-NEXT:    por %xmm0, %xmm1
-; X64-SSE41-NEXT:    por %xmm3, %xmm1
 ; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
 ; X64-SSE41-NEXT:    sete %al
 ; X64-SSE41-NEXT:    retq
@@ -1729,10 +1729,10 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
 ; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
 ; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pand %xmm0, %xmm1
-; X64-SSE2-NEXT:    pand %xmm3, %xmm1
 ; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-SSE2-NEXT:    sete %al
@@ -1747,10 +1747,10 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X64-SSE41-NEXT:    pxor %xmm0, %xmm3
 ; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE41-NEXT:    por %xmm3, %xmm0
 ; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE41-NEXT:    por %xmm0, %xmm1
-; X64-SSE41-NEXT:    por %xmm3, %xmm1
 ; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
 ; X64-SSE41-NEXT:    sete %al
 ; X64-SSE41-NEXT:    retq
@@ -1762,8 +1762,8 @@ define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
 ; X64-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
 ; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
 ; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpxor 32(%rsi), %xmm2, %xmm2
-; X64-AVX-NEXT:    vpor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor 32(%rsi), %xmm2, %xmm1
 ; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vptest %xmm0, %xmm0
 ; X64-AVX-NEXT:    sete %al
@@ -1798,8 +1798,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 ; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X64-SSE2-NEXT:    pand %xmm0, %xmm2
 ; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
 ; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
@@ -1813,8 +1813,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
 ; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE41-NEXT:    por %xmm1, %xmm0
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
 ; X64-SSE41-NEXT:    por %xmm0, %xmm2
 ; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
 ; X64-SSE41-NEXT:    setne %al
@@ -1893,13 +1893,13 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm4
 ; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
 ; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
 ; X64-SSE2-NEXT:    movdqu 47(%rsi), %xmm2
 ; X64-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm4, %xmm2
 ; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
 ; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-SSE2-NEXT:    setne %al
@@ -1915,13 +1915,13 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
 ; X64-SSE41-NEXT:    pxor %xmm0, %xmm4
 ; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE41-NEXT:    por %xmm4, %xmm0
 ; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE41-NEXT:    movdqu 47(%rsi), %xmm2
 ; X64-SSE41-NEXT:    pxor %xmm3, %xmm2
 ; X64-SSE41-NEXT:    por %xmm1, %xmm2
 ; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    por %xmm4, %xmm2
 ; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
 ; X64-SSE41-NEXT:    setne %al
 ; X64-SSE41-NEXT:    retq
@@ -2020,9 +2020,9 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; X64-SSE2-NEXT:    pand %xmm3, %xmm2
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-SSE2-NEXT:    sete %al
@@ -2038,9 +2038,9 @@ define i1 @length63_eq_const(ptr %X) nounwind {
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; X64-SSE41-NEXT:    por %xmm3, %xmm2
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    por %xmm2, %xmm1
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE41-NEXT:    por %xmm1, %xmm0
+; X64-SSE41-NEXT:    por %xmm2, %xmm0
 ; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X64-SSE41-NEXT:    sete %al
 ; X64-SSE41-NEXT:    retq
@@ -2118,13 +2118,13 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm4
 ; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
 ; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
 ; X64-SSE2-NEXT:    movdqu 48(%rsi), %xmm2
 ; X64-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
 ; X64-SSE2-NEXT:    pand %xmm1, %xmm2
 ; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm4, %xmm2
 ; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
 ; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-SSE2-NEXT:    setne %al
@@ -2140,13 +2140,13 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
 ; X64-SSE41-NEXT:    pxor %xmm0, %xmm4
 ; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
 ; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE41-NEXT:    por %xmm4, %xmm0
 ; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
 ; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE41-NEXT:    movdqu 48(%rsi), %xmm2
 ; X64-SSE41-NEXT:    pxor %xmm3, %xmm2
 ; X64-SSE41-NEXT:    por %xmm1, %xmm2
 ; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    por %xmm4, %xmm2
 ; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
 ; X64-SSE41-NEXT:    setne %al
 ; X64-SSE41-NEXT:    retq
@@ -2260,9 +2260,9 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; X64-SSE2-NEXT:    pand %xmm3, %xmm2
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pand %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-SSE2-NEXT:    sete %al
@@ -2278,9 +2278,9 @@ define i1 @length64_eq_const(ptr %X) nounwind {
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; X64-SSE41-NEXT:    por %xmm3, %xmm2
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    por %xmm2, %xmm1
 ; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE41-NEXT:    por %xmm1, %xmm0
+; X64-SSE41-NEXT:    por %xmm2, %xmm0
 ; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
 ; X64-SSE41-NEXT:    sete %al
 ; X64-SSE41-NEXT:    retq
@@ -2380,8 +2380,8 @@ define i1 @length96_eq(ptr %x, ptr %y) nounwind {
 ; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
 ; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
 ; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm1
 ; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX1-NEXT:    setne %al
@@ -2395,8 +2395,8 @@ define i1 @length96_eq(ptr %x, ptr %y) nounwind {
 ; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
 ; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm1
 ; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT:    setne %al
@@ -2509,8 +2509,8 @@ define i1 @length96_eq_const(ptr %X) nounwind {
 ; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
 ; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX1-NEXT:    sete %al
@@ -2524,8 +2524,8 @@ define i1 @length96_eq_const(ptr %X) nounwind {
 ; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
 ; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT:    sete %al
@@ -2616,9 +2616,9 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind {
 ; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX1-NEXT:    setne %al
 ; X64-AVX1-NEXT:    vzeroupper
@@ -2634,9 +2634,9 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind {
 ; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT:    setne %al
 ; X64-AVX2-NEXT:    vzeroupper
@@ -2752,9 +2752,9 @@ define i1 @length127_eq_const(ptr %X) nounwind {
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX1-NEXT:    sete %al
 ; X64-AVX1-NEXT:    vzeroupper
@@ -2770,9 +2770,9 @@ define i1 @length127_eq_const(ptr %X) nounwind {
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT:    sete %al
 ; X64-AVX2-NEXT:    vzeroupper
@@ -2866,9 +2866,9 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind {
 ; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX1-NEXT:    setne %al
 ; X64-AVX1-NEXT:    vzeroupper
@@ -2884,9 +2884,9 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind {
 ; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT:    setne %al
 ; X64-AVX2-NEXT:    vzeroupper
@@ -3002,9 +3002,9 @@ define i1 @length128_eq_const(ptr %X) nounwind {
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX1-NEXT:    sete %al
 ; X64-AVX1-NEXT:    vzeroupper
@@ -3020,9 +3020,9 @@ define i1 @length128_eq_const(ptr %X) nounwind {
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
 ; X64-AVX2-NEXT:    sete %al
 ; X64-AVX2-NEXT:    vzeroupper

diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index dea35ea11c229..83471b902be40 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -907,8 +907,8 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; SSE2-NEXT:    paddq %xmm3, %xmm4
 ; SSE2-NEXT:    psllq $32, %xmm4
 ; SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: vec128_i64_signed_reg_reg:
@@ -938,16 +938,16 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psrlq $1, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq $1, %xmm0
 ; SSE41-NEXT:    psrlq $33, %xmm1
 ; SSE41-NEXT:    pmuludq %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm3, %xmm0
-; SSE41-NEXT:    paddq %xmm1, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm3
+; SSE41-NEXT:    pmuludq %xmm0, %xmm3
+; SSE41-NEXT:    paddq %xmm1, %xmm3
+; SSE41-NEXT:    psllq $32, %xmm3
+; SSE41-NEXT:    pmuludq %xmm4, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    paddq %xmm3, %xmm0
 ; SSE41-NEXT:    retq
@@ -967,8 +967,8 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
@@ -986,8 +986,8 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_reg_reg:
@@ -1006,8 +1006,8 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_reg_reg:
@@ -1029,8 +1029,8 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1065,8 +1065,8 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
@@ -1122,8 +1122,8 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; SSE2-NEXT:    paddq %xmm3, %xmm4
 ; SSE2-NEXT:    psllq $32, %xmm4
 ; SSE2-NEXT:    pmuludq %xmm2, %xmm1
-; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1153,16 +1153,16 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    psrlq $1, %xmm3
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq $1, %xmm0
 ; SSE41-NEXT:    psrlq $33, %xmm1
 ; SSE41-NEXT:    pmuludq %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm3, %xmm0
-; SSE41-NEXT:    paddq %xmm1, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm3
+; SSE41-NEXT:    pmuludq %xmm0, %xmm3
+; SSE41-NEXT:    paddq %xmm1, %xmm3
+; SSE41-NEXT:    psllq $32, %xmm3
+; SSE41-NEXT:    pmuludq %xmm4, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    paddq %xmm3, %xmm0
 ; SSE41-NEXT:    retq
@@ -1186,8 +1186,8 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1209,8 +1209,8 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1229,8 +1229,8 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1252,8 +1252,8 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1288,8 +1288,8 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp ugt <2 x i64> %a1, %a2
@@ -1338,16 +1338,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; SSE2-NEXT:    pandn %xmm0, %xmm3
 ; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    psubq %xmm5, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm0
 ; SSE2-NEXT:    psrlq $33, %xmm3
 ; SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm4, %xmm0
-; SSE2-NEXT:    paddq %xmm3, %xmm0
-; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    psrlq $32, %xmm4
+; SSE2-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NEXT:    paddq %xmm3, %xmm4
+; SSE2-NEXT:    psllq $32, %xmm4
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
 ; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    retq
@@ -1380,16 +1380,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    psrlq $1, %xmm2
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq $1, %xmm0
 ; SSE41-NEXT:    psrlq $33, %xmm1
 ; SSE41-NEXT:    pmuludq %xmm4, %xmm1
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm2, %xmm0
-; SSE41-NEXT:    paddq %xmm1, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm2
+; SSE41-NEXT:    pmuludq %xmm0, %xmm2
+; SSE41-NEXT:    paddq %xmm1, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    pmuludq %xmm4, %xmm0
 ; SSE41-NEXT:    paddq %xmm3, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    retq
@@ -1410,8 +1410,8 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
@@ -1430,8 +1430,8 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_mem_reg:
@@ -1451,8 +1451,8 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; XOP-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_mem_reg:
@@ -1474,8 +1474,8 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX512F-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1511,8 +1511,8 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <2 x i64>, ptr %a1_addr
@@ -1570,8 +1570,8 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; SSE2-NEXT:    paddq %xmm2, %xmm4
 ; SSE2-NEXT:    psllq $32, %xmm4
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    paddq %xmm3, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: vec128_i64_signed_reg_mem:
@@ -1601,16 +1601,16 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    psubq %xmm5, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm2
-; SSE41-NEXT:    psrlq $1, %xmm2
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    psrlq $1, %xmm0
 ; SSE41-NEXT:    psrlq $33, %xmm3
 ; SSE41-NEXT:    pmuludq %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm2, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm2
+; SSE41-NEXT:    movdqa %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm2
+; SSE41-NEXT:    pmuludq %xmm0, %xmm2
+; SSE41-NEXT:    paddq %xmm3, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    pmuludq %xmm4, %xmm0
 ; SSE41-NEXT:    paddq %xmm1, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    retq
@@ -1631,8 +1631,8 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
@@ -1651,8 +1651,8 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_reg_mem:
@@ -1672,8 +1672,8 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_reg_mem:
@@ -1695,8 +1695,8 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1732,8 +1732,8 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a2 = load <2 x i64>, ptr %a2_addr
@@ -1782,16 +1782,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; SSE2-NEXT:    pandn %xmm0, %xmm3
 ; SSE2-NEXT:    por %xmm4, %xmm3
 ; SSE2-NEXT:    psubq %xmm5, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrlq $1, %xmm0
 ; SSE2-NEXT:    psrlq $33, %xmm3
 ; SSE2-NEXT:    pmuludq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm4, %xmm0
-; SSE2-NEXT:    paddq %xmm3, %xmm0
-; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    psrlq $32, %xmm4
+; SSE2-NEXT:    pmuludq %xmm0, %xmm4
+; SSE2-NEXT:    paddq %xmm3, %xmm4
+; SSE2-NEXT:    psllq $32, %xmm4
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
 ; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    retq
@@ -1824,16 +1824,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    psubq %xmm5, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm1
-; SSE41-NEXT:    psrlq $1, %xmm1
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    psrlq $1, %xmm0
 ; SSE41-NEXT:    psrlq $33, %xmm3
 ; SSE41-NEXT:    pmuludq %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    pmuludq %xmm0, %xmm1
+; SSE41-NEXT:    paddq %xmm3, %xmm1
+; SSE41-NEXT:    psllq $32, %xmm1
+; SSE41-NEXT:    pmuludq %xmm4, %xmm0
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    paddq %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -1855,8 +1855,8 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
@@ -1876,8 +1876,8 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_mem_mem:
@@ -1898,8 +1898,8 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_mem_mem:
@@ -1921,8 +1921,8 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1959,8 +1959,8 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <2 x i64>, ptr %a1_addr
@@ -2107,9 +2107,9 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    psubusw %xmm1, %xmm3
 ; SSE2-NEXT:    psubusw %xmm0, %xmm1
+; SSE2-NEXT:    paddw %xmm0, %xmm1
 ; SSE2-NEXT:    psubw %xmm0, %xmm3
 ; SSE2-NEXT:    paddw %xmm1, %xmm3
-; SSE2-NEXT:    paddw %xmm0, %xmm3
 ; SSE2-NEXT:    psrlw $1, %xmm3
 ; SSE2-NEXT:    pmullw %xmm2, %xmm3
 ; SSE2-NEXT:    paddw %xmm3, %xmm0

diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index f7505eddacb2c..7b54c1b355374 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -504,10 +504,10 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -527,8 +527,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_reg:
@@ -564,10 +564,10 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
 ; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm3, %xmm3
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -604,10 +604,10 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
 ; XOPAVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -630,8 +630,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_reg_reg:
@@ -665,8 +665,8 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
   %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
@@ -718,10 +718,10 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm6, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm8, %xmm2
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm8, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -745,8 +745,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_unsigned_reg_reg:
@@ -782,10 +782,10 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
 ; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm3, %xmm3
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -822,10 +822,10 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
 ; XOPAVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -848,8 +848,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_unsigned_reg_reg:
@@ -883,8 +883,8 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp ugt <4 x i64> %a1, %a2
   %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
@@ -932,10 +932,10 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm4, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -956,8 +956,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_reg:
@@ -994,10 +994,10 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
 ; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm4, %xmm4
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -1035,10 +1035,10 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
 ; XOPAVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -1061,8 +1061,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
 ; AVX512F-NEXT:    vpsllq $32, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_mem_reg:
@@ -1097,8 +1097,8 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm0, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <4 x i64>, ptr %a1_addr
   %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
@@ -1145,10 +1145,10 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -1169,8 +1169,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_mem:
@@ -1207,10 +1207,10 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -1248,10 +1248,10 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
 ; XOPAVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -1274,8 +1274,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_reg_mem:
@@ -1310,8 +1310,8 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a2 = load <4 x i64>, ptr %a2_addr
   %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
@@ -1359,10 +1359,10 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm7, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -1384,8 +1384,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_mem:
@@ -1423,10 +1423,10 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -1465,10 +1465,10 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
 ; XOPAVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -1491,8 +1491,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_mem_mem:
@@ -1528,8 +1528,8 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <4 x i64>, ptr %a1_addr
   %a2 = load <4 x i64>, ptr %a2_addr

diff  --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index 11c216e67bc82..601166d67f6f2 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -304,41 +304,40 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl %ebp, %eax
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    setl %dl
+; X86-NEXT:    movzbl %dl, %ebx
 ; X86-NEXT:    jl .LBB5_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    jmp .LBB5_3
 ; X86-NEXT:  .LBB5_1:
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:  .LBB5_3:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    shrdl $1, %edi, %eax
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
-; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -377,43 +376,42 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %ebx, %eax
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    setb %dl
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    testb %dl, %dl
 ; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    jmp .LBB6_3
 ; X86-NEXT:  .LBB6_1:
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:  .LBB6_3:
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    orl $1, %ebx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    orl $1, %ebp
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    shrdl $1, %edi, %eax
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    imull %ebx, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    imull %ebp, %edi
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -458,39 +456,39 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %ecx
-; X86-NEXT:    movl 4(%edx), %esi
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    movl 4(%ecx), %ecx
+; X86-NEXT:    cmpl %esi, %eax
 ; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    setl %dl
-; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    movzbl %dl, %ebx
 ; X86-NEXT:    jl .LBB7_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    jmp .LBB7_3
 ; X86-NEXT:  .LBB7_1:
 ; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:  .LBB7_3:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %ebx, %eax
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    sbbl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    shrdl $1, %edi, %eax
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
-; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -533,42 +531,41 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %eax
-; X86-NEXT:    movl 4(%ecx), %edi
-; X86-NEXT:    cmpl %ebp, %eax
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %eax
+; X86-NEXT:    movl 4(%edx), %edi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    setl %dl
+; X86-NEXT:    movzbl %dl, %ebx
 ; X86-NEXT:    jl .LBB8_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    jmp .LBB8_3
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    subl %esi, %eax
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    shrdl $1, %edi, %eax
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
-; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -613,40 +610,40 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl 4(%eax), %esi
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    movl 4(%eax), %ecx
 ; X86-NEXT:    movl (%edx), %eax
 ; X86-NEXT:    movl 4(%edx), %edi
-; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    cmpl %esi, %eax
 ; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    setl %dl
-; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    movzbl %dl, %ebx
 ; X86-NEXT:    jl .LBB9_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    jmp .LBB9_3
 ; X86-NEXT:  .LBB9_1:
 ; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:  .LBB9_3:
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    negl %ebx
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %ebx, %eax
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    sbbl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    shrdl $1, %edi, %eax
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
-; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 9ed2d40f3b54d..37a73fb961027 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -130,9 +130,9 @@ define i1 @allones_v64i8_sign(<64 x i8> %arg) {
 ; SSE-LABEL: allones_v64i8_sign:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -141,9 +141,9 @@ define i1 @allones_v64i8_sign(<64 x i8> %arg) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
 ; AVX1-NEXT:    sete %al
@@ -186,9 +186,9 @@ define i1 @allzeros_v64i8_sign(<64 x i8> %arg) {
 ; SSE-LABEL: allzeros_v64i8_sign:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -198,8 +198,8 @@ define i1 @allzeros_v64i8_sign(<64 x i8> %arg) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
 ; AVX1-NEXT:    sete %al
@@ -1322,10 +1322,10 @@ define i1 @allones_v64i8_and1(<64 x i8> %arg) {
 ; SSE-LABEL: allones_v64i8_and1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    psllw $7, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    psllw $7, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -1334,9 +1334,9 @@ define i1 @allones_v64i8_and1(<64 x i8> %arg) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
@@ -1383,10 +1383,10 @@ define i1 @allzeros_v64i8_and1(<64 x i8> %arg) {
 ; SSE-LABEL: allzeros_v64i8_and1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    psllw $7, %xmm1
-; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -1396,8 +1396,8 @@ define i1 @allzeros_v64i8_and1(<64 x i8> %arg) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
@@ -2611,10 +2611,10 @@ define i1 @allones_v64i8_and4(<64 x i8> %arg) {
 ; SSE-LABEL: allones_v64i8_and4:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm3, %xmm0
-; SSE-NEXT:    psllw $5, %xmm0
-; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    psllw $5, %xmm1
+; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -2623,9 +2623,9 @@ define i1 @allones_v64i8_and4(<64 x i8> %arg) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpsllw $5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
@@ -2672,10 +2672,10 @@ define i1 @allzeros_v64i8_and4(<64 x i8> %arg) {
 ; SSE-LABEL: allzeros_v64i8_and4:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    psllw $5, %xmm1
-; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    psllw $5, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -2685,8 +2685,8 @@ define i1 @allzeros_v64i8_and4(<64 x i8> %arg) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax

diff  --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index fa709689afae2..27e32e5613bcf 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -497,18 +497,13 @@ define i64 @test_mul_by_16(i64 %x) {
 define i64 @test_mul_by_17(i64 %x) {
 ; X86-LABEL: test_mul_by_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl $17, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X86-NOOPT-LABEL: test_mul_by_17:
@@ -697,8 +692,8 @@ define i64 @test_mul_by_22(i64 %x) {
 ; X86-NEXT:    leal (%ecx,%eax,4), %esi
 ; X86-NEXT:    movl $22, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -999,8 +994,8 @@ define i64 @test_mul_by_29(i64 %x) {
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    movl $29, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
@@ -1522,8 +1517,8 @@ define i64 @test_mul_spec(i64 %x) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    imull %esi, %ebx
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    imull %ecx, %edi
-; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1558,8 +1553,8 @@ define i64 @test_mul_spec(i64 %x) nounwind {
 ; X86-NOOPT-NEXT:    movl %esi, %eax
 ; X86-NOOPT-NEXT:    mull %edi
 ; X86-NOOPT-NEXT:    imull %esi, %ebx
+; X86-NOOPT-NEXT:    addl %ebx, %edx
 ; X86-NOOPT-NEXT:    imull %ecx, %edi
-; X86-NOOPT-NEXT:    addl %ebx, %edi
 ; X86-NOOPT-NEXT:    addl %edi, %edx
 ; X86-NOOPT-NEXT:    popl %esi
 ; X86-NOOPT-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll
index 59ed19841bd5a..beb2dba05e85a 100644
--- a/llvm/test/CodeGen/X86/mul-constant-result.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-result.ll
@@ -524,15 +524,18 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 define i32 @foo() local_unnamed_addr #0 {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 12
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    .cfi_offset %esi, -16
-; X86-NEXT:    .cfi_offset %edi, -12
-; X86-NEXT:    .cfi_offset %ebx, -8
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    .cfi_offset %esi, -20
+; X86-NEXT:    .cfi_offset %edi, -16
+; X86-NEXT:    .cfi_offset %ebx, -12
+; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    pushl $0
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $1
@@ -549,8 +552,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $2, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl $2, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    pushl $1
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $3
@@ -558,9 +562,8 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $3, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $3, %ebx
 ; X86-NEXT:    pushl $2
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $4
@@ -568,9 +571,10 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $4, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $4, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    pushl $2
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $5
@@ -580,7 +584,6 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    xorl $5, %edi
-; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $3
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $6
@@ -601,6 +604,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    xorl $7, %edi
 ; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    pushl $4
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $8
@@ -610,7 +614,6 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    xorl $8, %ebx
-; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    pushl $4
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $9
@@ -618,9 +621,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $9, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $9, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    pushl $5
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $10
@@ -630,7 +633,7 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    xorl $10, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    pushl $5
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $11
@@ -638,9 +641,10 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $11, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $11, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    pushl $6
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $12
@@ -650,7 +654,6 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    xorl $12, %ebx
-; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    pushl $6
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $13
@@ -678,9 +681,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $15, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    xorl $15, %ebp
+; X86-NEXT:    orl %ebx, %ebp
 ; X86-NEXT:    pushl $8
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $16
@@ -688,9 +691,10 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $16, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl $16, %edi
+; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    pushl $8
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $17
@@ -698,9 +702,8 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $17, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $17, %ebx
 ; X86-NEXT:    pushl $9
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $18
@@ -708,9 +711,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $18, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $18, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    pushl $9
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $19
@@ -718,9 +721,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $19, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $19, %ebx
+; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    pushl $10
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $20
@@ -728,9 +731,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $20, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $20, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    pushl $10
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $21
@@ -738,9 +741,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $21, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $21, %ebx
+; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    pushl $11
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $22
@@ -748,9 +751,10 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $22, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $22, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    pushl $11
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $23
@@ -758,9 +762,8 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $23, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $23, %ebx
 ; X86-NEXT:    pushl $12
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $24
@@ -768,9 +771,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $24, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl $24, %edi
+; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $12
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $25
@@ -778,9 +781,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $25, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $25, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    pushl $13
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $26
@@ -788,9 +791,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $26, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    xorl $26, %edi
+; X86-NEXT:    orl %ebx, %edi
 ; X86-NEXT:    pushl $13
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $27
@@ -798,9 +801,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $27, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    xorl $27, %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    pushl $14
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $28
@@ -808,9 +811,9 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    xorl $28, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    xorl $28, %ebp
+; X86-NEXT:    orl %ebx, %ebp
 ; X86-NEXT:    pushl $14
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $29
@@ -820,7 +823,8 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    xorl $29, %edi
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    pushl $15
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $30
@@ -830,7 +834,6 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    xorl $30, %ebx
-; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    pushl $15
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $31
@@ -838,10 +841,10 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    calll mult@PLT
 ; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    pushl $16
 ; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl $32
@@ -851,15 +854,17 @@ define i32 @foo() local_unnamed_addr #0 {
 ; X86-NEXT:    .cfi_adjust_cfa_offset -8
 ; X86-NEXT:    xorl $32, %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;

diff  --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index fc40f50ab4e5a..c6801b5757ea1 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -10,111 +10,112 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    subl $400, %esp # imm = 0x190
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 60(%ecx), %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 56(%ecx), %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%ebp), %esi
+; X32-NEXT:    movl 60(%ecx), %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl 56(%ecx), %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl (%eax), %ebp
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl 4(%ebp), %ecx
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 4(%eax), %ecx
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, %ebp
+; X32-NEXT:    movl %ecx, %edi
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 48(%esi), %edi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 48(%edi), %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 52(%esi), %eax
+; X32-NEXT:    movl 52(%edi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 8(%eax), %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %esi
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 12(%eax), %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl 12(%eax), %edi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -122,165 +123,166 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 40(%esi), %ebp
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl 40(%esi), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 44(%esi), %ebx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl 44(%esi), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 32(%edi), %ecx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl 32(%ebp), %edi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl 36(%edi), %edi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 36(%ebp), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %esi
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ebp, %ecx
-; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %edi, %eax
@@ -288,7 +290,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -298,66 +300,67 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 16(%eax), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 16(%eax), %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 20(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl 20(%eax), %edx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    mull %edx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb %cl
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %edi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl %bl, %ecx
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -365,78 +368,77 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 24(%eax), %ebx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebp, %edi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %edi, %ebp
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movzbl %bl, %edi
+; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -444,71 +446,72 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -516,74 +519,73 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl %ebx, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ebp
-; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X32-NEXT:    adcl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -593,115 +595,116 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 24(%esi), %ebx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 28(%esi), %ebp
+; X32-NEXT:    movl 24(%esi), %ebp
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl 28(%esi), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 16(%esi), %edi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 16(%edi), %esi
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 20(%esi), %eax
+; X32-NEXT:    movl 20(%edi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebx
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
@@ -712,89 +715,89 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 8(%esi), %ebp
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 8(%esi), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 12(%esi), %ebx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 12(%esi), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl (%esi), %ecx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl 4(%esi), %eax
+; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl (%ebp), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 4(%ebp), %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -802,43 +805,44 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %esi
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
@@ -846,33 +850,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ebp, %ecx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ebx, %eax
+; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X32-NEXT:    adcl %edi, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -880,130 +884,130 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %ebp, %edi
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %edi, %ebp
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebp, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movzbl %bl, %edi
+; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
@@ -1018,41 +1022,42 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
@@ -1062,98 +1067,96 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebx, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT:    addl %esi, %ebx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -1170,7 +1173,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ebx
-; X32-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl $0, %ebx
@@ -1184,9 +1187,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
@@ -1201,50 +1204,53 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    movl 32(%ebp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl 32(%ebx), %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl 36(%ebp), %ebp
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 36(%eax), %ecx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
@@ -1252,8 +1258,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %ebx, %ecx
@@ -1265,24 +1270,23 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 40(%eax), %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebx
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl 40(%eax), %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 44(%eax), %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl 44(%eax), %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
@@ -1290,17 +1294,16 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %ebx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1317,20 +1320,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -1338,60 +1341,61 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1400,75 +1404,76 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ebx, %ecx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ebx, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1478,34 +1483,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 48(%eax), %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl 48(%eax), %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, %ecx
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ecx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 52(%eax), %edx
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
@@ -1513,29 +1517,29 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1550,34 +1554,34 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 60(%eax), %ecx
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebx, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1589,25 +1593,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl %bl, %esi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -1621,7 +1625,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %ebx, %edx
+; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %ebp, %ecx
 ; X32-NEXT:    adcl $0, %ecx
@@ -1664,27 +1668,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
@@ -1704,30 +1708,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1739,30 +1743,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    movl %ecx, %ebx
-; X32-NEXT:    adcl %eax, %ebx
+; X32-NEXT:    movl %ecx, %edi
+; X32-NEXT:    adcl %eax, %edi
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1775,11 +1779,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %eax, %ebx
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1790,7 +1793,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -1803,7 +1806,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1847,54 +1849,55 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    addl %ebx, %ebp
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %ebx, %ecx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -1902,24 +1905,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1927,7 +1930,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -1947,7 +1950,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebp, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -1957,267 +1960,268 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ebp, %ecx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movl %ebp, %esi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ebx, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %ebp
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ebx, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %bl, %ecx
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
@@ -2230,63 +2234,62 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %ebx, %edx
+; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edi, %ecx
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
@@ -2296,83 +2299,82 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ebx, %ebp
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %esi, %edi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    addl %ebx, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    adcl %eax, %ebp
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2384,14 +2386,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl %eax, %ebp
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2418,141 +2420,143 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 64(%eax), %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 68(%eax), %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ecx
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl 68(%eax), %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ebp
-; X32-NEXT:    setb %cl
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 72(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl 72(%eax), %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 76(%eax), %ecx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    movl 76(%eax), %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -2562,96 +2566,97 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ebx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %ebp, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2659,44 +2664,42 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl %bl, %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ebx, %eax
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X32-NEXT:    adcl %edi, %eax
+; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2704,63 +2707,64 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 80(%eax), %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 84(%eax), %ecx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ecx, %ebp
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %edi
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    adcl %esi, %edi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2768,39 +2772,38 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 88(%eax), %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 92(%eax), %edi
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2813,26 +2816,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    movl %edi, %ecx
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %esi, %edi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ebx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
@@ -2845,16 +2848,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %ebx, %edx
+; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -2864,130 +2866,130 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %ebx, %ecx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ebx, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %esi, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %edi, %ebx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    adcl %eax, %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ebx
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2997,14 +2999,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %eax, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -3039,7 +3041,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
@@ -3086,7 +3088,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
@@ -3135,52 +3137,56 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    addl %esi, %ebx
-; X32-NEXT:    addl %edx, %ebx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ecx, %edx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    imull %ebx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %esi, %ebp
-; X32-NEXT:    addl %edx, %ebp
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %edx, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl %ebx, %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb %bl
+; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3190,48 +3196,50 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    imull %ebx, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %edi, %edx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %ecx, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %esi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    imull %edi, %ecx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -3240,79 +3248,79 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 104(%ecx), %ebp
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl 104(%esi), %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl 108(%ecx), %ecx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebx, %edi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 108(%esi), %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %ebx, %edi
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 96(%ecx), %esi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl 96(%esi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 100(%ecx), %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 100(%esi), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3323,31 +3331,32 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    adcl %ebx, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebp, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ebx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3358,21 +3367,21 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    adcl %edi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -3380,98 +3389,101 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 112(%ecx), %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    imull %eax, %edi
+; X32-NEXT:    movl 112(%ecx), %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl 116(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    imull %eax, %ebx
-; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl 120(%ecx), %eax
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %ecx
-; X32-NEXT:    movl 124(%edx), %edi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %ecx, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %edi
-; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl 124(%esi), %esi
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %ecx, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %bl, %esi
-; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %edx
+; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    addl %edx, %ebx
+; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    imull %ebx, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %edi, %ecx
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %edx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %eax, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %ecx
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %ebx, %ecx
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3480,10 +3492,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movzbl %bl, %edi
 ; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -3505,7 +3516,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -3542,7 +3553,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
@@ -3633,7 +3644,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -3653,7 +3664,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebx, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -3678,7 +3689,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
@@ -3708,7 +3719,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
@@ -3803,7 +3814,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -3948,7 +3959,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3983,7 +3994,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -4006,6 +4017,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
@@ -4015,18 +4027,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -4048,9 +4060,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -4081,7 +4093,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ebx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4108,7 +4120,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
@@ -4140,7 +4152,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
@@ -4161,14 +4173,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 104(%eax), %esi
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
@@ -4190,7 +4202,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %esi
@@ -4207,7 +4219,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4239,106 +4251,110 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %edi
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %edi, %edx
+; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ecx, %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    imull %ebp, %esi
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 124(%edi), %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl 120(%edi), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl 120(%ebx), %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    imull (%esp), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl 112(%edi), %ecx
-; X32-NEXT:    movl 116(%edi), %edi
+; X32-NEXT:    movl 124(%ebx), %eax
+; X32-NEXT:    imull %ecx, %eax
+; X32-NEXT:    addl %eax, %esi
+; X32-NEXT:    movl 112(%ebx), %edi
+; X32-NEXT:    movl 116(%ebx), %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    imull %edi, %edx
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ecx, %ebx
-; X32-NEXT:    addl %edx, %ebx
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %edx, %ebx
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    imull %ebp, %ebx
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %ebx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %edi, %ecx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -4348,7 +4364,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -4377,7 +4393,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4411,7 +4427,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
@@ -4445,9 +4461,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -4476,98 +4492,101 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl %ecx, %ebx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %edi
-; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    imull %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    imull %ebp, %ecx
-; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    imull %ebx, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %edx
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ecx, %edi
-; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    imull %esi, %edi
 ; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    imull %esi, %ecx
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    imull %edi, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %ecx
-; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %ebp, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ebp, %ebx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
@@ -4583,7 +4602,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -4605,9 +4624,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -4618,9 +4637,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -4658,9 +4677,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4698,7 +4717,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -4787,198 +4806,198 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq 40(%rdi), %rbx
-; X64-NEXT:    movq 32(%rdi), %r12
-; X64-NEXT:    movq 56(%rdi), %r14
+; X64-NEXT:    movq 32(%rdi), %r14
+; X64-NEXT:    movq 56(%rdi), %r15
 ; X64-NEXT:    movq 48(%rdi), %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%rsi), %r11
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    movq %rsi, %r13
+; X64-NEXT:    movq 8(%rsi), %r8
+; X64-NEXT:    movq %rsi, %r12
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rsi, %r9
-; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    addq %rcx, %r9
+; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r9, %r8
-; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %r9, %rcx
+; X64-NEXT:    adcq %rsi, %r10
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %r9d
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %r10, %rsi
+; X64-NEXT:    adcq %r9, %r13
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r10, %rdi
-; X64-NEXT:    adcq %r9, %r15
-; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rbx, %r14
+; X64-NEXT:    movq %rbx, %r15
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %r9, %r11
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    addq %r11, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r10, %r9
 ; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r9, %rbx
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r11
-; X64-NEXT:    addq %rbp, %rbx
-; X64-NEXT:    adcq %r8, %r11
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 16(%r13), %rcx
+; X64-NEXT:    addq %rdi, %rbx
+; X64-NEXT:    adcq %rcx, %r11
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, %rbp
+; X64-NEXT:    movq 16(%r12), %r8
+; X64-NEXT:    movq %r14, %r10
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r8, %r14
+; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq 24(%r13), %r13
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq 24(%r12), %rbp
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    adcq %r9, %r12
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %r12, %r9
 ; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r8
-; X64-NEXT:    addq %rbx, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rax, %rdi
+; X64-NEXT:    addq %rbx, %rcx
+; X64-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
 ; X64-NEXT:    adcq %r11, %r14
-; X64-NEXT:    movq %r14, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    addq %rdi, %r9
-; X64-NEXT:    adcq %r15, %r8
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    addq %rsi, %r9
+; X64-NEXT:    adcq %r13, %rdi
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rsi, %r11
-; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rcx, %r11
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r11, %rax
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    adcq %rdi, %rsi
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    movzbl %dil, %esi
-; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq %r9, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r8, %r11
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    movzbl %sil, %ecx
+; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    addq %r9, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rdi, %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r10b, %esi
-; X64-NEXT:    adcq %rsi, %rax
+; X64-NEXT:    movzbl %r10b, %ecx
+; X64-NEXT:    adcq %rcx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 16(%rcx), %r10
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq 16(%r8), %rsi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq 24(%rcx), %r14
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq 24(%r8), %r14
 ; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %r13, %rbx
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rcx, %r11
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r8, %rsi
-; X64-NEXT:    adcq %rdi, %r11
+; X64-NEXT:    addq %r11, %rsi
+; X64-NEXT:    adcq %rdi, %rbx
 ; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %r13, %r12
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r11, %r8
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rbx, %rdi
 ; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    movq (%rcx), %r13
+; X64-NEXT:    adcq %rax, %rcx
+; X64-NEXT:    movq (%r8), %r13
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq 8(%rcx), %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq 8(%r8), %rax
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %r11, %r14
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    movq %r12, %r11
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %r12
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r8, %rbp
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r12, %rbx
@@ -4986,32 +5005,32 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq %rax, %r11
 ; X64-NEXT:    addq %r9, %rbx
 ; X64-NEXT:    adcq %rsi, %r11
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %r13, %r10
 ; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rsi, %r14
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    adcq %r9, %r12
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %r12, %rsi
@@ -5023,199 +5042,203 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    addq %r8, %rsi
-; X64-NEXT:    adcq %rbp, %r9
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    adcq %rcx, %r9
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rdi, %r11
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %r11, %rcx
-; X64-NEXT:    adcq %r8, %rbx
+; X64-NEXT:    addq %rbx, %rcx
+; X64-NEXT:    adcq %r11, %r14
 ; X64-NEXT:    setb %r11b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movzbl %r11b, %edi
 ; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    addq %rsi, %r14
+; X64-NEXT:    addq %rsi, %r12
 ; X64-NEXT:    adcq %r9, %rcx
 ; X64-NEXT:    movzbl %r10b, %esi
 ; X64-NEXT:    adcq %rsi, %rax
 ; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq (%rsp), %rax # 8-byte Folded Reload
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq (%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq 32(%r10), %rcx
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq 32(%rcx), %rdi
+; X64-NEXT:    movq %r8, %r10
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r12, %r8
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %rbx, %r14
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rsi, %r11
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq 40(%r10), %rcx
-; X64-NEXT:    movq %r10, %rdi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %r12
+; X64-NEXT:    movq 40(%rcx), %rsi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rsi, %r15
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %r11, %rsi
 ; X64-NEXT:    adcq %r9, %rbx
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rbx, %r11
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %rbx, %r13
 ; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %r15, %rbx
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %r15
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    adcq %r14, %r10
+; X64-NEXT:    setb %r15b
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r15, %r14
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    addq %r10, %r14
+; X64-NEXT:    movzbl %r15b, %eax
 ; X64-NEXT:    adcq %rax, %rbx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    addq %r8, %r14
 ; X64-NEXT:    adcq %rsi, %rbx
 ; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq 48(%rdi), %r12
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq 48(%rcx), %rcx
+; X64-NEXT:    movq %rbp, %r15
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %rsi, %r13
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq 56(%rdi), %rsi
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq 56(%r8), %rsi
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r13, %rdi
-; X64-NEXT:    adcq %r15, %rcx
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %r13, %r12
+; X64-NEXT:    adcq %r10, %r15
+; X64-NEXT:    setb %r8b
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rcx, %r13
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    addq %r15, %r13
+; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %rsi
 ; X64-NEXT:    addq %r14, %rbp
-; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    movq %rbp, %r8
+; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    adcq %rbx, %rdi
 ; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    addq %r11, %r13
 ; X64-NEXT:    adcq %r9, %rsi
-; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; X64-NEXT:    setb %bpl
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    addq %r9, %rbx
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rbx, %r12
-; X64-NEXT:    adcq %r9, %rcx
-; X64-NEXT:    setb %r15b
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rbx, %r9
+; X64-NEXT:    adcq %r10, %r15
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    movzbl %r15b, %eax
+; X64-NEXT:    addq %r15, %rbx
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    addq %r13, %rbp
-; X64-NEXT:    adcq %rsi, %r12
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    addq %r13, %r12
+; X64-NEXT:    adcq %rsi, %r9
+; X64-NEXT:    movzbl %bpl, %eax
 ; X64-NEXT:    adcq %rax, %rbx
 ; X64-NEXT:    adcq $0, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    adcq %rax, (%rsp) # 8-byte Folded Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
@@ -5224,127 +5247,127 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rcx, %r10
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rdi, %r12
-; X64-NEXT:    adcq %rsi, %rcx
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    addq %r10, %r12
+; X64-NEXT:    adcq %rsi, %r15
+; X64-NEXT:    setb %r8b
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %r15, %rsi
+; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rcx, %r13
+; X64-NEXT:    addq %r10, %r13
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r10, %r11
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %rcx
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    adcq %r15, %r10
+; X64-NEXT:    setb %r8b
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %rcx, %rbp
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    addq %r10, %rbp
+; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %r13
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    addq %rdi, %rbp
 ; X64-NEXT:    adcq %r12, %r13
-; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r11, %r10
+; X64-NEXT:    movq %r11, %r8
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    movq %r15, %r12
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rcx, %r15
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    addq %rdi, %r15
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    addq %r15, %rax
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    adcq %r8, %rcx
+; X64-NEXT:    adcq %r10, %r8
 ; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rcx, %r12
+; X64-NEXT:    addq %r8, %r12
 ; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %r8
+; X64-NEXT:    adcq %rax, %rdi
 ; X64-NEXT:    addq %rbp, %r11
 ; X64-NEXT:    adcq %r13, %r15
 ; X64-NEXT:    movq %r15, %rbp
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    addq %rdi, %r12
-; X64-NEXT:    adcq %r9, %r8
-; X64-NEXT:    setb %r9b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    addq %rsi, %r12
+; X64-NEXT:    adcq %r9, %rdi
+; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %rcx, %rsi
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rcx, %r8
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %r8, %rax
+; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    addq %r12, %r10
-; X64-NEXT:    adcq %r8, %rdi
-; X64-NEXT:    movzbl %r9b, %ecx
+; X64-NEXT:    adcq %rdi, %r8
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X64-NEXT:    adcq %rcx, %rax
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    adcq $0, %rdx
@@ -5359,342 +5382,350 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X64-NEXT:    adcq %rax, %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq 64(%r9), %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq 64(%r10), %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r15
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq 72(%r10), %rcx
-; X64-NEXT:    movq %r10, %rsi
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq 72(%r9), %rsi
+; X64-NEXT:    movq %r9, %rcx
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rsi, %r13
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r9, %rbx
-; X64-NEXT:    adcq %r8, %rcx
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    addq %r8, %rbx
+; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    setb %r8b
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r13
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    addq %r10, %r9
+; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    adcq %rax, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %r15, %rdi
 ; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %r8, %r14
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rcx, %r15
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r13, %r12
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %r15, %rax
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %rcx
+; X64-NEXT:    adcq %r10, %r8
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r15, %r13
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %rcx, %rbp
+; X64-NEXT:    addq %r8, %rbp
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r14
 ; X64-NEXT:    addq %r11, %rbp
 ; X64-NEXT:    adcq %rbx, %r14
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %rsi, %r10
-; X64-NEXT:    movq 80(%rsi), %r15
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    movq 80(%rcx), %r15
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq 88(%r10), %r10
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %r11, %rcx
-; X64-NEXT:    setb %r11b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r8, %r11
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq 88(%rbx), %rbx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    adcq %r10, %r8
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rcx, %r13
-; X64-NEXT:    movzbl %r11b, %eax
+; X64-NEXT:    addq %r8, %r13
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbp, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rbp, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r14, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    adcq $0, %r12
 ; X64-NEXT:    addq %r9, %r13
-; X64-NEXT:    adcq %rdi, %r12
-; X64-NEXT:    setb %r9b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    adcq %rsi, %r12
+; X64-NEXT:    setb %bpl
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rdi, %r10
 ; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    adcq %r8, %rcx
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    addq %r10, %rax
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    adcq %r8, %rdi
 ; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    addq %rdi, %rax
 ; X64-NEXT:    movzbl %r8b, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r13, %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r12, %rsi
+; X64-NEXT:    addq %r13, %rsi
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r9b, %ecx
+; X64-NEXT:    adcq %r12, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movzbl %bpl, %ecx
 ; X64-NEXT:    adcq %rcx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %r10
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    imulq %rax, %rbx
+; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    imulq %rsi, %r15
-; X64-NEXT:    addq %r10, %r15
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rbx, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    imulq %rcx, %r15
 ; X64-NEXT:    addq %rdx, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    imulq %r14, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %r10, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    imulq %rbx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    imulq %rdi, %r10
-; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    adcq %r15, %r10
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    imulq %rsi, %rbx
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    addq %r8, %rdi
+; X64-NEXT:    adcq %r15, %rbx
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r15, %r13
-; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r13, %r15
-; X64-NEXT:    adcq %r12, %rbp
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    addq %r8, %r15
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r15, %r11
+; X64-NEXT:    adcq %r10, %r8
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rbp, %r12
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %r8, %r15
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    addq %r8, %r12
-; X64-NEXT:    adcq %r10, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq 120(%rdx), %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq 112(%rdx), %r9
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    addq %rdi, %r15
+; X64-NEXT:    adcq %rbx, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq 112(%r9), %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    imulq %rbx, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    addq %rdx, %r9
-; X64-NEXT:    movq 96(%rdi), %rbp
-; X64-NEXT:    movq 104(%rdi), %rdi
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    imulq %rcx, %rbx
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    movq 120(%r9), %rax
 ; X64-NEXT:    imulq %rdi, %rax
-; X64-NEXT:    imulq %rbp, %r11
-; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdi, %rbp
+; X64-NEXT:    addq %rax, %rbx
+; X64-NEXT:    movq 96(%r9), %r10
+; X64-NEXT:    movq 104(%r9), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    imulq %rdi, %r12
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rdx, %r11
+; X64-NEXT:    addq %r12, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    imulq %r10, %r14
+; X64-NEXT:    addq %rdx, %r14
 ; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    adcq %r9, %r11
-; X64-NEXT:    movq %r11, %r14
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    adcq %rbx, %r14
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r8, %r9
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r9, %r8
-; X64-NEXT:    adcq %rcx, %rbp
-; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %r8, %r12
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %r12, %rbx
+; X64-NEXT:    adcq %rbp, %r10
+; X64-NEXT:    setb %r8b
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    addq %r10, %rax
+; X64-NEXT:    movzbl %r8b, %edi
+; X64-NEXT:    adcq %rdi, %rdx
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    adcq %r14, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r15, %r8
-; X64-NEXT:    adcq %r12, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r11, %rbx
+; X64-NEXT:    adcq %r15, %rax
 ; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq 80(%r8), %r9
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq 80(%r13), %r8
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 88(%r8), %rbx
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 88(%r13), %r11
+; X64-NEXT:    movq %r13, %r10
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r11
+; X64-NEXT:    movq %rsi, %r9
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %rcx, %rdi
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %rcx, %rsi
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %rbx
-; X64-NEXT:    movq 64(%r8), %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    adcq %rax, %r13
+; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    movq 64(%r10), %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 72(%r8), %rax
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq 72(%rdi), %rax
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %rcx, %r12
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r8
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r11, %r9
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r12, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r15, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r13, %r11
-; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    movq %r8, %r11
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %r10, %rbp
+; X64-NEXT:    addq %rbx, %rbp
 ; X64-NEXT:    adcq %r14, %r12
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r11, %rbx
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rcx, %r14
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r14, %r8
+; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    adcq %r10, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r14
@@ -5703,26 +5734,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq %rax, %r10
 ; X64-NEXT:    addq %rbp, %r9
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r12, %r8
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r12, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r14
 ; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    adcq %rbx, %r10
+; X64-NEXT:    adcq %r13, %r10
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r13
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rcx, %r9
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r9, %rax
@@ -5734,8 +5765,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r14, %r8
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r14, %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r10, %r9
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movzbl %dil, %ecx
@@ -5743,163 +5774,160 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq 96(%r8), %rcx
-; X64-NEXT:    imulq %rcx, %r15
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %r13, %r9
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq 104(%r8), %rdi
-; X64-NEXT:    imulq %rdi, %r9
-; X64-NEXT:    addq %r15, %r9
-; X64-NEXT:    addq %rdx, %r9
-; X64-NEXT:    movq 112(%r8), %rax
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    imulq %r11, %rdx
-; X64-NEXT:    movq 120(%r8), %r8
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    imulq %r10, %r8
-; X64-NEXT:    addq %rdx, %r8
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rdx, %r8
-; X64-NEXT:    addq %rsi, %r13
-; X64-NEXT:    adcq %r9, %r8
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r10, %r9
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq 96(%rdi), %rsi
+; X64-NEXT:    imulq %rsi, %r15
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %r15, %rdx
+; X64-NEXT:    movq 104(%rdi), %r9
+; X64-NEXT:    imulq %r9, %rcx
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movq 112(%rdi), %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    imulq %r12, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    movq 120(%rdi), %rdi
+; X64-NEXT:    imulq %r15, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    addq %r10, %r8
+; X64-NEXT:    adcq %r14, %rdi
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    addq %r10, %r13
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rsi, %r10
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %r10, %r12
-; X64-NEXT:    adcq %rcx, %rsi
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %r13, %rbp
+; X64-NEXT:    adcq %r14, %rcx
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    addq %rcx, %r14
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    adcq %rax, %r10
-; X64-NEXT:    addq %r13, %r14
-; X64-NEXT:    adcq %r8, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %rsi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %r8, %r14
+; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    imulq %r15, %rcx
-; X64-NEXT:    addq %rsi, %rcx
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    movq %rcx, %r8
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    imulq %r15, %rdi
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    imulq %rsi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    imulq %rdi, %rbx
-; X64-NEXT:    addq %rax, %rbx
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %rdi, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    imulq %r12, %rsi
+; X64-NEXT:    addq %rdx, %rsi
+; X64-NEXT:    movq %rsi, %r8
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r11, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    imulq %r9, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    addq %rsi, %rdx
+; X64-NEXT:    imulq %r11, %rbx
 ; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    addq %r9, %rcx
+; X64-NEXT:    addq %rcx, %r13
 ; X64-NEXT:    adcq %r8, %rbx
-; X64-NEXT:    movq %rbx, %rbp
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r9
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %rsi, %rbx
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r8, %rdi
-; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %r8, %rsi
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rdi, %r11
-; X64-NEXT:    adcq %rsi, %r8
+; X64-NEXT:    addq %rsi, %r11
+; X64-NEXT:    adcq %rdi, %r8
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    addq %r8, %rax
 ; X64-NEXT:    movzbl %sil, %esi
 ; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    adcq %rbp, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r12, %r11
+; X64-NEXT:    addq %r13, %rax
+; X64-NEXT:    adcq %rbx, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq %rbp, %r11
 ; X64-NEXT:    adcq %r14, %rax
 ; X64-NEXT:    adcq %r10, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    movq %rdi, %r9
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, (%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, 8(%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, 16(%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, 24(%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, 32(%rcx)
-; X64-NEXT:    movq (%rsp), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, 40(%rcx)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, 48(%rcx)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, 56(%rcx)
-; X64-NEXT:    movq %rdi, 64(%rcx)
-; X64-NEXT:    movq %r8, 72(%rcx)
-; X64-NEXT:    movq %r9, 80(%rcx)
-; X64-NEXT:    movq %r10, 88(%rcx)
-; X64-NEXT:    movq %r13, 96(%rcx)
-; X64-NEXT:    movq %r11, 104(%rcx)
-; X64-NEXT:    movq %rax, 112(%rcx)
-; X64-NEXT:    movq %rdx, 120(%rcx)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, (%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 8(%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 16(%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 24(%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 32(%rsi)
+; X64-NEXT:    movq (%rsp), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 40(%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 48(%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 56(%rsi)
+; X64-NEXT:    movq %r8, 64(%rsi)
+; X64-NEXT:    movq %r9, 72(%rsi)
+; X64-NEXT:    movq %r10, 80(%rsi)
+; X64-NEXT:    movq %rbx, 88(%rsi)
+; X64-NEXT:    movq %rcx, 96(%rsi)
+; X64-NEXT:    movq %r11, 104(%rsi)
+; X64-NEXT:    movq %rax, 112(%rsi)
+; X64-NEXT:    movq %rdx, 120(%rsi)
 ; X64-NEXT:    addq $240, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12

diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 434d14ad4abed..83e91063cf84f 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -23,141 +23,140 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X32-NEXT:    .cfi_offset %ebp, -8
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 12(%ecx), %ebp
-; X32-NEXT:    movl 8(%ecx), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%eax), %ebx
+; X32-NEXT:    movl 12(%ecx), %esi
+; X32-NEXT:    movl 8(%ecx), %ebx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl (%eax), %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 4(%eax), %ecx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl 4(%eax), %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl (%edi), %esi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl (%edi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 4(%edi), %edi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl 4(%edi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 8(%eax), %esi
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl 8(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 12(%eax), %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 12(%eax), %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebx, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %edi, %ebx
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -167,41 +166,43 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 16(%ecx), %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %edi, %ebx
+; X32-NEXT:    movl %ebx, %esi
+; X32-NEXT:    imull %edi, %esi
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    movl 20(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 20(%ecx), %ebp
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %ebx, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    imull %eax, %ebp
+; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    movl 24(%ecx), %eax
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %edx
-; X32-NEXT:    movl 28(%ecx), %ecx
+; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl 28(%ecx), %ecx
+; X32-NEXT:    imull %esi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
@@ -213,84 +214,85 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movzbl %bl, %esi
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 28(%edi), %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %ecx
-; X32-NEXT:    movl 24(%edi), %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl 24(%edi), %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl 16(%edi), %ecx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    movl 28(%edi), %eax
+; X32-NEXT:    imull %esi, %eax
+; X32-NEXT:    addl %eax, %ecx
+; X32-NEXT:    movl 16(%edi), %ebp
 ; X32-NEXT:    movl 20(%edi), %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    imull %ebx, %edx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    imull %ebx, %edi
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %ecx, %edi
-; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %edi, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    adcl %ecx, %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    adcl %ebx, %edi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    adcl %ecx, %edi
+; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movzbl %bl, %esi
-; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, %ebx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, (%esi)
+; X32-NEXT:    movl %edi, (%ecx)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, 4(%esi)
+; X32-NEXT:    movl %edi, 4(%ecx)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, 8(%esi)
+; X32-NEXT:    movl %edi, 8(%ecx)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, 12(%esi)
-; X32-NEXT:    movl %ebx, 16(%esi)
-; X32-NEXT:    movl %ecx, 20(%esi)
-; X32-NEXT:    movl %eax, 24(%esi)
-; X32-NEXT:    movl %edx, 28(%esi)
+; X32-NEXT:    movl %edi, 12(%ecx)
+; X32-NEXT:    movl %ebx, 16(%ecx)
+; X32-NEXT:    movl %esi, 20(%ecx)
+; X32-NEXT:    movl %eax, 24(%ecx)
+; X32-NEXT:    movl %edx, 28(%ecx)
 ; X32-NEXT:    addl $72, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 20
 ; X32-NEXT:    popl %esi
@@ -309,12 +311,9 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %r12
-; X64-NEXT:    .cfi_def_cfa_offset 32
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 40
-; X64-NEXT:    .cfi_offset %rbx, -40
-; X64-NEXT:    .cfi_offset %r12, -32
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    .cfi_offset %rbx, -32
 ; X64-NEXT:    .cfi_offset %r14, -24
 ; X64-NEXT:    .cfi_offset %r15, -16
 ; X64-NEXT:    movq %rdx, %rcx
@@ -330,16 +329,16 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    imulq %r14, %r10
-; X64-NEXT:    addq %r15, %r10
 ; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    movq %r8, %r12
-; X64-NEXT:    imulq %r11, %r12
+; X64-NEXT:    addq %r15, %r10
+; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    imulq %r11, %r15
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r15, %rdx
 ; X64-NEXT:    movq 24(%rsi), %r15
 ; X64-NEXT:    imulq %rbx, %r15
-; X64-NEXT:    addq %r12, %r15
 ; X64-NEXT:    addq %rdx, %r15
 ; X64-NEXT:    addq %rdi, %r8
 ; X64-NEXT:    adcq %r10, %r15
@@ -372,8 +371,6 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X64-NEXT:    movq %rax, 16(%rcx)
 ; X64-NEXT:    movq %rdx, 24(%rcx)
 ; X64-NEXT:    popq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    popq %r12
 ; X64-NEXT:    .cfi_def_cfa_offset 24
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 16

diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index f5254ed37b53d..c39c8337ca455 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -33,6 +33,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, %ebp
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -46,74 +47,73 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 16(%ecx), %ebx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 16(%ecx), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl 20(%ecx), %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 20(%ecx), %ebx
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebp, %esi
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
-; X32-NEXT:    movzbl %cl, %eax
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 8(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 8(%edi), %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 12(%eax), %ecx
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl 12(%edi), %ecx
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    adcl %ebp, %edi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -125,7 +125,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
@@ -145,7 +145,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    addl %ecx, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -154,9 +154,9 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 8(%ecx), %edi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 8(%ecx), %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -166,20 +166,19 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -192,66 +191,66 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl 4(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %ebx, %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    adcl %ebp, %edi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -263,28 +262,27 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    adcl %ecx, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movl %ebp, %edx
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %edi, %eax
@@ -297,109 +295,111 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 16(%eax), %ebp
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl 16(%eax), %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 20(%eax), %ecx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 24(%eax), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl 24(%eax), %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 28(%eax), %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl 28(%eax), %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ebx, %ebp
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -407,29 +407,31 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -445,20 +447,21 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
@@ -467,80 +470,81 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    adcl %esi, %edi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %edi, %esi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %edi, %ebx
+; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -549,59 +553,58 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    adcl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X32-NEXT:    addl %edi, %edx
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ebx
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %eax, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 32(%eax), %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -609,7 +612,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
@@ -621,7 +624,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl 36(%eax), %ecx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
@@ -649,7 +652,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
@@ -662,14 +665,14 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 40(%eax), %esi
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
@@ -708,7 +711,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -740,115 +743,120 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %edi
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ecx, %edi
-; X32-NEXT:    addl %edx, %edi
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 60(%edi), %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl 56(%edi), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl 56(%ebx), %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl 48(%edi), %ecx
-; X32-NEXT:    movl 52(%edi), %ebx
+; X32-NEXT:    movl 60(%ebx), %eax
+; X32-NEXT:    imull %ecx, %eax
+; X32-NEXT:    addl %eax, %esi
+; X32-NEXT:    movl 48(%ebx), %edi
+; X32-NEXT:    movl 52(%ebx), %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    imull %ebx, %edx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %ecx, %edi
-; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    imull %ebp, %ebx
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %ebx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %edi, %ecx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl %edi, %edx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -880,10 +888,10 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 32(%ecx), %esi
 ; X32-NEXT:    movl %esi, %eax
@@ -917,8 +925,8 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -951,20 +959,20 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
@@ -972,60 +980,60 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl %ecx, %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 48(%ecx), %ebp
+; X32-NEXT:    movl 48(%ecx), %edi
 ; X32-NEXT:    movl %ebx, %esi
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl 52(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    imull %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl 56(%ecx), %eax
-; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ecx, %edx
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl 60(%esi), %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %edi, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    mull %edi
+; X32-NEXT:    imull %ecx, %esi
 ; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
@@ -1042,43 +1050,43 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %esi, %edx
+; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %edi, %ebp
-; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ecx, %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %edi
+; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    imull %edi, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    imull %ebx, %ecx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    addl %ebx, %ecx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
@@ -1097,9 +1105,9 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -1171,270 +1179,272 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %rdx, (%rsp) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq (%rdi), %r14
+; X64-NEXT:    movq (%rdi), %rbx
 ; X64-NEXT:    movq 8(%rdi), %r9
-; X64-NEXT:    movq 24(%rdi), %r15
-; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rdi
-; X64-NEXT:    movq 8(%rsi), %rbx
-; X64-NEXT:    movq %rsi, %r12
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq 24(%rdi), %r12
+; X64-NEXT:    movq 16(%rdi), %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq (%rsi), %rcx
+; X64-NEXT:    movq 8(%rsi), %r11
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %rcx, %rsi
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rsi, %r15
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %rcx, %r10
 ; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r10, %r11
-; X64-NEXT:    adcq %r8, %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %r10, %rcx
+; X64-NEXT:    adcq %r8, %r14
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %esi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rcx, %r10
+; X64-NEXT:    addq %r14, %r10
 ; X64-NEXT:    adcq %rsi, %r13
-; X64-NEXT:    movq %r14, %rsi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rcx, %r15
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %r14, %r15
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    addq %r15, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %rbp
+; X64-NEXT:    adcq %rbp, %rbx
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r9, %rdi
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rbp, %rcx
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %rbx, %rbp
 ; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq %r11, %r14
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    adcq %rcx, %r14
 ; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %r12, %r9
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 16(%r12), %rsi
-; X64-NEXT:    movq %r8, %rbx
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r12
+; X64-NEXT:    movq %rdi, %rsi
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 16(%rdi), %r8
+; X64-NEXT:    movq %r12, %r11
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %r9, %r12
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq 24(%rsi), %rsi
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r11, %rbp
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq 24(%r9), %rdi
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rbp, %rbx
-; X64-NEXT:    adcq %r15, %r9
-; X64-NEXT:    setb %bpl
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r15, %r11
+; X64-NEXT:    adcq %rbx, %r9
+; X64-NEXT:    setb %bl
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r9, %r11
-; X64-NEXT:    movzbl %bpl, %eax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %r9, %rcx
+; X64-NEXT:    movzbl %bl, %eax
 ; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    addq %rbp, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r14, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq %r10, %r11
+; X64-NEXT:    addq %r10, %rcx
 ; X64-NEXT:    adcq %r13, %r15
-; X64-NEXT:    setb %bpl
+; X64-NEXT:    setb %r12b
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    addq %rdi, %rbx
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    adcq %r9, %rbp
+; X64-NEXT:    setb %dil
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    adcq %r8, %rcx
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rcx, %r14
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %r11, %rbx
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    addq %rbp, %rax
+; X64-NEXT:    movzbl %dil, %edi
+; X64-NEXT:    adcq %rdi, %rdx
+; X64-NEXT:    addq %rcx, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r15, %rbx
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %bpl, %eax
-; X64-NEXT:    adcq %rax, %r14
+; X64-NEXT:    movzbl %r12b, %ecx
+; X64-NEXT:    adcq %rcx, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 32(%rcx), %r11
-; X64-NEXT:    imulq %r11, %rdi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq 32(%rcx), %r15
+; X64-NEXT:    imulq %r15, %rsi
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq 40(%rcx), %r8
-; X64-NEXT:    imulq %r8, %rsi
-; X64-NEXT:    addq %rdi, %rsi
-; X64-NEXT:    addq %rdx, %rsi
+; X64-NEXT:    addq %rsi, %rdx
+; X64-NEXT:    movq 40(%rcx), %rsi
+; X64-NEXT:    imulq %rsi, %r8
+; X64-NEXT:    addq %rdx, %r8
 ; X64-NEXT:    movq 48(%rcx), %rax
-; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    imulq %r10, %rcx
-; X64-NEXT:    movq 56(%rdx), %rbp
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    imulq %rdi, %rbp
-; X64-NEXT:    addq %rcx, %rbp
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rcx, %r11
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    imulq %r14, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rdx, %rbp
+; X64-NEXT:    addq %rdi, %rdx
+; X64-NEXT:    movq 56(%r11), %r11
+; X64-NEXT:    imulq %rbx, %r11
+; X64-NEXT:    addq %rdx, %r11
 ; X64-NEXT:    addq %r9, %rcx
-; X64-NEXT:    adcq %rsi, %rbp
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    adcq %r8, %r11
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %rbx, %r8
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rdi, %r11
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rdi, %rbx
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r11, %rbx
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    addq %rbx, %r13
 ; X64-NEXT:    adcq %r9, %r15
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    setb %dil
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %r15, %r8
-; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    adcq %rax, %r12
 ; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    adcq %rbp, %r12
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 56(%rcx), %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %rsi
-; X64-NEXT:    movq 48(%rcx), %r11
-; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    imulq %rcx, %r11
-; X64-NEXT:    addq %rsi, %r11
-; X64-NEXT:    addq %rdx, %r11
-; X64-NEXT:    movq 32(%rdi), %r9
-; X64-NEXT:    movq 40(%rdi), %rdi
+; X64-NEXT:    adcq %r11, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    movq 48(%r9), %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    imulq %r14, %rsi
+; X64-NEXT:    addq %rdx, %rsi
+; X64-NEXT:    movq %r9, %rdx
+; X64-NEXT:    movq 56(%r9), %rax
+; X64-NEXT:    imulq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    addq %rax, %rsi
+; X64-NEXT:    movq 32(%r9), %r9
+; X64-NEXT:    movq 40(%rdx), %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %rdx
-; X64-NEXT:    imulq %rdi, %rdx
-; X64-NEXT:    imulq %r9, %r13
-; X64-NEXT:    addq %rdx, %r13
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    imulq %r15, %r11
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rdx, %r13
-; X64-NEXT:    addq %r10, %r15
-; X64-NEXT:    adcq %r11, %r13
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %r11, %rdx
+; X64-NEXT:    imulq %r9, %r10
+; X64-NEXT:    addq %rdx, %r10
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq %rsi, %r10
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r10, %r11
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rsi, %rbx
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %r10
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %r11, %rcx
-; X64-NEXT:    adcq %rbp, %r9
-; X64-NEXT:    setb %r11b
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movzbl %r11b, %edi
-; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    addq %r15, %rax
-; X64-NEXT:    adcq %r13, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    adcq %rbx, %rcx
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rbx, %r9
+; X64-NEXT:    adcq %rbp, %rsi
+; X64-NEXT:    setb %bl
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    movzbl %bl, %esi
+; X64-NEXT:    adcq %rsi, %rdx
+; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    adcq %r10, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq %r13, %r9
 ; X64-NEXT:    adcq %r8, %rax
 ; X64-NEXT:    adcq %r12, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    movq (%rsp), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, (%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 8(%rsi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, 16(%rsi)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, (%rdi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, 8(%rdi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, 16(%rdi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, 24(%rdi)
-; X64-NEXT:    movq %rsi, 32(%rdi)
-; X64-NEXT:    movq %rcx, 40(%rdi)
-; X64-NEXT:    movq %rax, 48(%rdi)
-; X64-NEXT:    movq %rdx, 56(%rdi)
+; X64-NEXT:    movq %rdi, 24(%rsi)
+; X64-NEXT:    movq %rcx, 32(%rsi)
+; X64-NEXT:    movq %r9, 40(%rsi)
+; X64-NEXT:    movq %rax, 48(%rsi)
+; X64-NEXT:    movq %rdx, 56(%rsi)
+; X64-NEXT:    addq $8, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13

diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index 0bc23af920eed..4e63580beb148 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -9,8 +9,8 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    imulq %rdi, %rcx
 ; X64-NEXT:    mulq %rdx
+; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    imulq %rsi, %r8
-; X64-NEXT:    addq %rcx, %r8
 ; X64-NEXT:    addq %r8, %rdx
 ; X64-NEXT:    retq
 ;
@@ -24,68 +24,71 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 28
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 32
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %ecx, %ebp
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %ebp, %esi
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %bl, %esi
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movzbl %bl, %edi
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%esi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, (%esi)
+; X86-NEXT:    movl %eax, 8(%esi)
+; X86-NEXT:    movl %edx, 12(%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16

diff --git a/llvm/test/CodeGen/X86/mul64.ll b/llvm/test/CodeGen/X86/mul64.ll
index d2afb2c529e42..1feed4b207a23 100644
--- a/llvm/test/CodeGen/X86/mul64.ll
+++ b/llvm/test/CodeGen/X86/mul64.ll
@@ -11,8 +11,8 @@ define i64 @foo(i64 %t, i64 %u) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
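
The mul64.ll hunk above is the transformation at its smallest: rather than summing the two imull cross terms into %esi and only then folding the total into %edx, each partial product is now added into %edx as soon as it is ready, so the first addl no longer waits on the second imull. A minimal C sketch of the same arithmetic follows; it is an illustration only, not the compiler's internal form, and the function and parameter names are invented.

    #include <stdint.h>

    /* 64x64 -> 64 multiply built from 32-bit parts, as in mul64.ll. */
    uint64_t mul64_parts(uint32_t a_lo, uint32_t a_hi,
                         uint32_t b_lo, uint32_t b_hi) {
        uint64_t prod = (uint64_t)a_lo * b_lo;  /* mull: full 32x32 product */
        uint32_t c1 = a_hi * b_lo;              /* imull cross term 1 */
        uint32_t c2 = a_lo * b_hi;              /* imull cross term 2 */
        uint32_t hi = (uint32_t)(prod >> 32);

        /* Old schedule: hi += (c1 + c2) -- the final add depends on both
         * imulls.  New schedule: hi = (hi + c1) + c2 -- the first add can
         * execute while the second imull is still in flight. */
        hi += c1;
        hi += c2;
        return ((uint64_t)hi << 32) | (uint32_t)prod;
    }
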

diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index a184d49ce75a7..4ecc60dd1a4c5 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -13,52 +13,54 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    .cfi_offset %rbx, -24
 ; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movq %rdx, %r10
-; CHECK-NEXT:    movq %rdi, %r9
+; CHECK-NEXT:    movq %rdx, %r11
+; CHECK-NEXT:    movq %rdi, %r10
 ; CHECK-NEXT:    movq %rsi, %rdi
 ; CHECK-NEXT:    sarq $63, %rdi
-; CHECK-NEXT:    movq %rcx, %r11
-; CHECK-NEXT:    imulq %rdi, %r11
+; CHECK-NEXT:    movq %rcx, %r8
+; CHECK-NEXT:    imulq %rdi, %r8
 ; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    mulq %rdi
-; CHECK-NEXT:    movq %rax, %rdi
-; CHECK-NEXT:    addq %rax, %r11
-; CHECK-NEXT:    addq %rdx, %r11
+; CHECK-NEXT:    movq %rdx, %rdi
+; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rdi
+; CHECK-NEXT:    addq %r8, %rdi
 ; CHECK-NEXT:    movq %rcx, %rax
 ; CHECK-NEXT:    sarq $63, %rax
 ; CHECK-NEXT:    movq %rax, %r14
 ; CHECK-NEXT:    imulq %rsi, %r14
-; CHECK-NEXT:    mulq %r9
-; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    addq %rax, %r14
-; CHECK-NEXT:    addq %rdx, %r14
-; CHECK-NEXT:    addq %rdi, %r8
-; CHECK-NEXT:    adcq %r11, %r14
-; CHECK-NEXT:    movq %r9, %rax
 ; CHECK-NEXT:    mulq %r10
-; CHECK-NEXT:    movq %rdx, %r11
+; CHECK-NEXT:    movq %rax, %r9
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    addq %r14, %r8
+; CHECK-NEXT:    addq %rax, %r8
+; CHECK-NEXT:    addq %rbx, %r9
+; CHECK-NEXT:    adcq %rdi, %r8
+; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    mulq %r11
+; CHECK-NEXT:    movq %rdx, %rbx
 ; CHECK-NEXT:    movq %rax, %rdi
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    mulq %r10
-; CHECK-NEXT:    movq %rdx, %r10
-; CHECK-NEXT:    movq %rax, %rbx
-; CHECK-NEXT:    addq %r11, %rbx
-; CHECK-NEXT:    adcq $0, %r10
-; CHECK-NEXT:    movq %r9, %rax
-; CHECK-NEXT:    mulq %rcx
+; CHECK-NEXT:    mulq %r11
 ; CHECK-NEXT:    movq %rdx, %r11
-; CHECK-NEXT:    movq %rax, %r9
-; CHECK-NEXT:    addq %rbx, %r9
-; CHECK-NEXT:    adcq %r10, %r11
+; CHECK-NEXT:    movq %rax, %r14
+; CHECK-NEXT:    addq %rbx, %r14
+; CHECK-NEXT:    adcq $0, %r11
+; CHECK-NEXT:    movq %r10, %rax
+; CHECK-NEXT:    mulq %rcx
+; CHECK-NEXT:    movq %rdx, %rbx
+; CHECK-NEXT:    movq %rax, %r10
+; CHECK-NEXT:    addq %r14, %r10
+; CHECK-NEXT:    adcq %r11, %rbx
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movzbl %al, %r10d
+; CHECK-NEXT:    movzbl %al, %r11d
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    mulq %rcx
-; CHECK-NEXT:    addq %r11, %rax
-; CHECK-NEXT:    adcq %r10, %rdx
-; CHECK-NEXT:    addq %r8, %rax
-; CHECK-NEXT:    adcq %r14, %rdx
-; CHECK-NEXT:    movq %r9, %rcx
+; CHECK-NEXT:    addq %rbx, %rax
+; CHECK-NEXT:    adcq %r11, %rdx
+; CHECK-NEXT:    addq %r9, %rax
+; CHECK-NEXT:    adcq %r8, %rdx
+; CHECK-NEXT:    movq %r10, %rcx
 ; CHECK-NEXT:    sarq $63, %rcx
 ; CHECK-NEXT:    xorq %rcx, %rdx
 ; CHECK-NEXT:    xorq %rax, %rcx
@@ -66,7 +68,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:  ## %bb.2: ## %nooverflow
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq %r9, %rdx
+; CHECK-NEXT:    movq %r10, %rdx
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
index 3e92b3321ba9c..b0739b6f47458 100644
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -171,9 +171,10 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    jne LBB0_17
 ; CHECK-NEXT:  LBB0_18: ## %bb26
 ; CHECK-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT:    addl %ecx, %esi
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    addl %ecx, %edx
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; CHECK-NEXT:    addl %esi, %edx
 ; CHECK-NEXT:    jmp LBB0_23
 ; CHECK-NEXT:  LBB0_19: ## %bb29
 ; CHECK-NEXT:    testl %edx, %edx
@@ -605,9 +606,15 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    jne LBB1_17
 ; CHECK-NEXT:  LBB1_18: ## %bb26
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    addl %eax, %edx
+; CHECK-NEXT:    shrl %ecx
+; CHECK-NEXT:    subl $4, %esp
+; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    pushl $128
+; CHECK-NEXT:    pushl %edx
 ; CHECK-NEXT:    jmp LBB1_23
 ; CHECK-NEXT:  LBB1_19: ## %bb29
 ; CHECK-NEXT:    testl %ebp, %ebp
@@ -638,12 +645,12 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:  LBB1_23: ## %bb33
 ; CHECK-NEXT:    shrl %eax
 ; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    pushl $128
 ; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:  LBB1_23: ## %bb33
 ; CHECK-NEXT:    calll _memset
 ; CHECK-NEXT:    addl $44, %esp
 ; CHECK-NEXT:  LBB1_25: ## %return

diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index fb222cd2b2e10..ff3bdd2a22abd 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -352,23 +352,23 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %edi, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %esi, %edx
-; X86-NOSSE-NEXT:    movl %ecx, %esi
-; X86-NOSSE-NEXT:    shrl %esi
-; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %esi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %esi
-; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %ecx, %edi
+; X86-NOSSE-NEXT:    shrl %edi
+; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %edi
+; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %esi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %esi
-; X86-NOSSE-NEXT:    shrl $4, %esi
-; X86-NOSSE-NEXT:    addl %ecx, %esi
-; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %esi, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    addl %edi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %ecx, %edi
+; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
 ; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    addl %esi, %ecx
 ; X86-NOSSE-NEXT:    movl %ecx, (%eax)
 ; X86-NOSSE-NEXT:    movl $0, 12(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 8(%eax)
@@ -420,18 +420,20 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ;
 ; X86-POPCNT-LABEL: cnt128:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %esi
 ; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    addl %edx, %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
-; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    movl %edx, (%eax)
+; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    addl %ecx, %esi
+; X86-POPCNT-NEXT:    addl %edx, %esi
+; X86-POPCNT-NEXT:    movl %esi, (%eax)
 ; X86-POPCNT-NEXT:    movl $0, 12(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 8(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 4(%eax)
+; X86-POPCNT-NEXT:    popl %esi
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128:
@@ -798,80 +800,82 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    subl %eax, %ebx
-; X86-NOSSE-NEXT:    movl $858993459, %eax # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    andl %eax, %edi
+; X86-NOSSE-NEXT:    movl %ebx, %ecx
+; X86-NOSSE-NEXT:    shrl %ecx
+; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %edi, %ecx
+; X86-NOSSE-NEXT:    subl %ecx, %ebx
+; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %ebx, %ebp
+; X86-NOSSE-NEXT:    andl %ecx, %ebp
 ; X86-NOSSE-NEXT:    shrl $2, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    addl %edi, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    subl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl %eax, %esi
-; X86-NOSSE-NEXT:    addl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %esi, %ebx
-; X86-NOSSE-NEXT:    movl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    andl %esi, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %ebp # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %ebp
-; X86-NOSSE-NEXT:    andl %esi, %ebx
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    addl %ebp, %edi
-; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    addl %ebp, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %ebp
+; X86-NOSSE-NEXT:    shrl $4, %ebp
+; X86-NOSSE-NEXT:    addl %ebx, %ebp
+; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
+; X86-NOSSE-NEXT:    andl %edi, %ebx
+; X86-NOSSE-NEXT:    subl %ebx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    subl %ebx, %edx
-; X86-NOSSE-NEXT:    movl %edx, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl %eax, %edx
-; X86-NOSSE-NEXT:    addl %ebx, %edx
-; X86-NOSSE-NEXT:    movl %edx, %ebp
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    addl %ebx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %eax, %edi
+; X86-NOSSE-NEXT:    movl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %ebx, %ebp
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %eax
+; X86-NOSSE-NEXT:    andl %ebx, %edi
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    addl %eax, %edi
+; X86-NOSSE-NEXT:    movl %esi, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %ebp # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %ebp, %eax
+; X86-NOSSE-NEXT:    subl %eax, %esi
+; X86-NOSSE-NEXT:    movl %esi, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebp
 ; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %edx, %ebp
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    andl %esi, %ebp
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    addl %esi, %ebp
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %esi, %eax
+; X86-NOSSE-NEXT:    subl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    shrl $2, %edx
+; X86-NOSSE-NEXT:    andl %ecx, %edx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl $4, %eax
+; X86-NOSSE-NEXT:    addl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ebx, %ebp
+; X86-NOSSE-NEXT:    andl %ebx, %eax
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl %ecx, %edi
-; X86-NOSSE-NEXT:    subl %edi, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    andl %eax, %ecx
-; X86-NOSSE-NEXT:    shrl $2, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    addl %ecx, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    shrl $4, %ecx
-; X86-NOSSE-NEXT:    addl %ebx, %ecx
-; X86-NOSSE-NEXT:    andl %esi, %ecx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    xorl %edx, %edx
-; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    addl %edi, %edx
+; X86-NOSSE-NEXT:    xorl %ecx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -920,19 +924,21 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ;
 ; X86-POPCNT-LABEL: cnt128_optsize:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %esi
 ; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    addl %edx, %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
-; X86-POPCNT-NEXT:    addl %ecx, %edx
+; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    addl %ecx, %esi
+; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
 ; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
-; X86-POPCNT-NEXT:    movl %edx, (%eax)
+; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    popl %esi
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_optsize:
@@ -1223,80 +1229,82 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    subl %eax, %ebx
-; X86-NOSSE-NEXT:    movl $858993459, %eax # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    andl %eax, %edi
+; X86-NOSSE-NEXT:    movl %ebx, %ecx
+; X86-NOSSE-NEXT:    shrl %ecx
+; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %edi, %ecx
+; X86-NOSSE-NEXT:    subl %ecx, %ebx
+; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %ebx, %ebp
+; X86-NOSSE-NEXT:    andl %ecx, %ebp
 ; X86-NOSSE-NEXT:    shrl $2, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    addl %edi, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    subl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl %eax, %esi
-; X86-NOSSE-NEXT:    addl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %esi, %ebx
-; X86-NOSSE-NEXT:    movl $252645135, %esi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    andl %esi, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %ebp # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %ebp
-; X86-NOSSE-NEXT:    andl %esi, %ebx
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    addl %ebp, %edi
-; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    addl %ebp, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %ebp
+; X86-NOSSE-NEXT:    shrl $4, %ebp
+; X86-NOSSE-NEXT:    addl %ebx, %ebp
+; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
+; X86-NOSSE-NEXT:    andl %edi, %ebx
+; X86-NOSSE-NEXT:    subl %ebx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    subl %ebx, %edx
-; X86-NOSSE-NEXT:    movl %edx, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl %eax, %edx
-; X86-NOSSE-NEXT:    addl %ebx, %edx
-; X86-NOSSE-NEXT:    movl %edx, %ebp
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    addl %ebx, %eax
+; X86-NOSSE-NEXT:    movl %eax, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %eax, %edi
+; X86-NOSSE-NEXT:    movl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %ebx, %ebp
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %eax
+; X86-NOSSE-NEXT:    andl %ebx, %edi
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    addl %eax, %edi
+; X86-NOSSE-NEXT:    movl %esi, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %ebp # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %ebp, %eax
+; X86-NOSSE-NEXT:    subl %eax, %esi
+; X86-NOSSE-NEXT:    movl %esi, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebp
 ; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %edx, %ebp
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    andl %esi, %ebp
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    addl %esi, %ebp
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %esi, %eax
+; X86-NOSSE-NEXT:    subl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    shrl $2, %edx
+; X86-NOSSE-NEXT:    andl %ecx, %edx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    movl %edx, %eax
+; X86-NOSSE-NEXT:    shrl $4, %eax
+; X86-NOSSE-NEXT:    addl %edx, %eax
+; X86-NOSSE-NEXT:    andl %ebx, %ebp
+; X86-NOSSE-NEXT:    andl %ebx, %eax
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    movl %ebx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl %ecx, %edi
-; X86-NOSSE-NEXT:    subl %edi, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    andl %eax, %ecx
-; X86-NOSSE-NEXT:    shrl $2, %ebx
-; X86-NOSSE-NEXT:    andl %eax, %ebx
-; X86-NOSSE-NEXT:    addl %ecx, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    shrl $4, %ecx
-; X86-NOSSE-NEXT:    addl %ebx, %ecx
-; X86-NOSSE-NEXT:    andl %esi, %ecx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    xorl %edx, %edx
-; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    addl %edi, %edx
+; X86-NOSSE-NEXT:    xorl %ecx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -1345,19 +1353,21 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ;
 ; X86-POPCNT-LABEL: cnt128_pgso:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %esi
 ; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    addl %edx, %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
-; X86-POPCNT-NEXT:    addl %ecx, %edx
+; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    addl %ecx, %esi
+; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
 ; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
-; X86-POPCNT-NEXT:    movl %edx, (%eax)
+; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    popl %esi
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_pgso:

diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll
index 1705edab9e82b..de34bfb13159c 100644
--- a/llvm/test/CodeGen/X86/pr34080-2.ll
+++ b/llvm/test/CodeGen/X86/pr34080-2.ll
@@ -31,6 +31,10 @@ define void @computeJD(ptr) nounwind {
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    imull %edx
 ; CHECK-NEXT:    movl %edx, %edi
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    sarl $7, %edi
+; CHECK-NEXT:    addl %eax, %edi
 ; CHECK-NEXT:    imull $36525, %esi, %eax # imm = 0x8EAD
 ; CHECK-NEXT:    addl $172251900, %eax # imm = 0xA445AFC
 ; CHECK-NEXT:    movl $1374389535, %edx # imm = 0x51EB851F
@@ -39,11 +43,7 @@ define void @computeJD(ptr) nounwind {
 ; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    sarl $5, %edx
 ; CHECK-NEXT:    addl %eax, %edx
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    addl 16(%ebx), %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    sarl $7, %edi
 ; CHECK-NEXT:    addl %edi, %ecx
 ; CHECK-NEXT:    leal 257(%ecx,%edx), %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)

diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index fb04f743cdfd5..066cbb6193317 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -104,11 +104,11 @@ define i32 @veccond512(<16 x i32> %input) {
 ; SSE2-LABEL: veccond512:
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    je .LBB2_2
 ; SSE2-NEXT:  # %bb.1: # %if-true-block
@@ -121,9 +121,9 @@ define i32 @veccond512(<16 x i32> %input) {
 ; SSE41-LABEL: veccond512:
 ; SSE41:       # %bb.0: # %entry
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    je .LBB2_2
 ; SSE41-NEXT:  # %bb.1: # %if-true-block
 ; SSE41-NEXT:    xorl %eax, %eax
@@ -237,11 +237,11 @@ define i32 @vectest512(<16 x i32> %input) {
 ; SSE2-LABEL: vectest512:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %ecx
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %ecx
 ; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
 ; SSE2-NEXT:    setne %al
@@ -250,10 +250,10 @@ define i32 @vectest512(<16 x i32> %input) {
 ; SSE41-LABEL: vectest512:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    retq
 ;
@@ -349,11 +349,11 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movl %edi, %eax
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %ecx
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %ecx
 ; SSE2-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
 ; SSE2-NEXT:    cmovel %esi, %eax
 ; SSE2-NEXT:    retq
@@ -362,9 +362,9 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movl %edi, %eax
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    cmovel %esi, %eax
 ; SSE41-NEXT:    retq
 ;

diff --git a/llvm/test/CodeGen/X86/rev16.ll b/llvm/test/CodeGen/X86/rev16.ll
index fc9b606627fa0..ecb1970185c2d 100644
--- a/llvm/test/CodeGen/X86/rev16.ll
+++ b/llvm/test/CodeGen/X86/rev16.ll
@@ -58,28 +58,29 @@ define i32 @not_rev16(i32 %a) {
 define i32 @extra_maskop_uses2(i32 %a) {
 ; X86-LABEL: extra_maskop_uses2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    shll $8, %edx
-; X86-NEXT:    shrl $8, %ecx
-; X86-NEXT:    andl $-16711936, %edx # imm = 0xFF00FF00
-; X86-NEXT:    andl $16711935, %ecx # imm = 0xFF00FF
-; X86-NEXT:    leal (%ecx,%edx), %eax
-; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    andl $-16711936, %ecx # imm = 0xFF00FF00
+; X86-NEXT:    andl $16711935, %eax # imm = 0xFF00FF
+; X86-NEXT:    leal (%eax,%ecx), %edx
 ; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    imull %edx, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extra_maskop_uses2:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $8, %eax
 ; X64-NEXT:    shrl $8, %edi
-; X64-NEXT:    andl $-16711936, %ecx # imm = 0xFF00FF00
+; X64-NEXT:    andl $-16711936, %eax # imm = 0xFF00FF00
 ; X64-NEXT:    andl $16711935, %edi # imm = 0xFF00FF
-; X64-NEXT:    leal (%rdi,%rcx), %eax
-; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    leal (%rdi,%rax), %ecx
 ; X64-NEXT:    imull %edi, %eax
+; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %l8 = shl i32 %a, 8
   %r8 = lshr i32 %a, 8

diff --git a/llvm/test/CodeGen/X86/rotate-multi.ll b/llvm/test/CodeGen/X86/rotate-multi.ll
index 6334b4e5e1fa8..8b4c852fd7ef7 100644
--- a/llvm/test/CodeGen/X86/rotate-multi.ll
+++ b/llvm/test/CodeGen/X86/rotate-multi.ll
@@ -27,8 +27,8 @@ define i32 @f1(i32 %a0, i32 %a1) #0 {
 ; CHECK:       # %bb.0: # %b0
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    shll $7, %eax
+; CHECK-NEXT:    orl %esi, %eax
 ; CHECK-NEXT:    roll $9, %edi
-; CHECK-NEXT:    orl %esi, %edi
 ; CHECK-NEXT:    orl %edi, %eax
 ; CHECK-NEXT:    retq
 b0:
@@ -52,10 +52,10 @@ define i32 @f2(i32 %a0, i32 %a1) #0 {
 ; CHECK-NEXT:    shrl $21, %edi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    shll $19, %eax
+; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    shrl $13, %esi
 ; CHECK-NEXT:    orl %edi, %esi
 ; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    retq
   %v0 = shl i32 %a0, 11
   %v1 = lshr i32 %a0, 21
@@ -73,33 +73,33 @@ define i32 @f3(i32 %a0) #0 {
 ; CHECK-LABEL: f3:
 ; CHECK:       # %bb.0: # %b0
 ; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal (,%rdi,8), %ecx
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $5, %eax
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    shll $7, %edx
-; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    leal (,%rdi,8), %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    shll $5, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shll $13, %eax
-; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    shll $7, %eax
 ; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    shll $19, %edx
+; CHECK-NEXT:    shll $13, %edx
 ; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    orl %ecx, %edx
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    movl %edi, %edx
-; CHECK-NEXT:    shrl $15, %edx
-; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    shll $19, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    shrl $2, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
 ; CHECK-NEXT:    movl %edi, %esi
-; CHECK-NEXT:    shrl $23, %esi
+; CHECK-NEXT:    shrl $15, %esi
+; CHECK-NEXT:    orl %ecx, %esi
 ; CHECK-NEXT:    orl %edx, %esi
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    shrl $23, %ecx
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    shrl $25, %eax
-; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    orl %ecx, %eax
 ; CHECK-NEXT:    shrl $30, %edi
 ; CHECK-NEXT:    orl %edi, %eax
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    orl %esi, %eax
 ; CHECK-NEXT:    retq
 b0:
   %v0 = shl i32 %a0, 3

diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 1038665ccadac..3d3e935045475 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -52,8 +52,8 @@ define dso_local i32 @sad_16i8() nounwind {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -152,31 +152,31 @@ define dso_local i32 @sad_32i8() nounwind {
 ; SSE2:       # %bb.0: # %entry
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB1_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm3
 ; SSE2-NEXT:    psadbw b+1024(%rax), %xmm3
-; SSE2-NEXT:    paddd %xmm3, %xmm2
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa a+1040(%rax), %xmm3
 ; SSE2-NEXT:    psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    addq $32, %rax
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm0, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: sad_32i8:
@@ -205,8 +205,8 @@ define dso_local i32 @sad_32i8() nounwind {
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -339,15 +339,15 @@ define dso_local i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    paddd %xmm4, %xmm0
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
-; SSE2-NEXT:    paddd %xmm5, %xmm1
-; SSE2-NEXT:    paddd %xmm5, %xmm2
-; SSE2-NEXT:    paddd %xmm5, %xmm2
+; SSE2-NEXT:    paddd %xmm5, %xmm3
 ; SSE2-NEXT:    paddd %xmm5, %xmm1
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    paddd %xmm5, %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm5
+; SSE2-NEXT:    paddd %xmm0, %xmm5
+; SSE2-NEXT:    paddd %xmm1, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; SSE2-NEXT:    paddd %xmm5, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm1, %eax
@@ -355,10 +355,10 @@ define dso_local i32 @sad_avx64i8() nounwind {
 ;
 ; AVX1-LABEL: sad_avx64i8:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB2_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -370,32 +370,32 @@ define dso_local i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    vpsadbw b+1056(%rax), %xmm5, %xmm5
 ; AVX1-NEXT:    vmovdqa a+1072(%rax), %xmm6
 ; AVX1-NEXT:    vpsadbw b+1072(%rax), %xmm6, %xmm6
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
 ; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddd %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
 ; AVX1-NEXT:    addq $64, %rax
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm4, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm7
-; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm8
-; AVX1-NEXT:    vpaddd %xmm1, %xmm8, %xmm8
-; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm7
+; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm8
+; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm8
+; AVX1-NEXT:    vpaddd %xmm2, %xmm8, %xmm2
+; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm6, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm8, %xmm1
-; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -426,8 +426,8 @@ define dso_local i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddd %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
-; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -878,11 +878,11 @@ define dso_local i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <6
 ; SSE2-NEXT:    psadbw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqu 32(%rdi), %xmm1
 ; SSE2-NEXT:    psadbw %xmm2, %xmm1
+; SSE2-NEXT:    paddq %xmm4, %xmm1
 ; SSE2-NEXT:    movdqu 48(%rdi), %xmm2
 ; SSE2-NEXT:    psadbw %xmm3, %xmm2
 ; SSE2-NEXT:    paddq %xmm0, %xmm2
 ; SSE2-NEXT:    paddq %xmm1, %xmm2
-; SSE2-NEXT:    paddq %xmm4, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddq %xmm2, %xmm0
 ; SSE2-NEXT:    movd %xmm0, %eax
@@ -898,8 +898,8 @@ define dso_local i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <6
 ; AVX1-NEXT:    vpsadbw 16(%rdx), %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsadbw 32(%rdx), %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpsadbw (%rdx), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index f67d8e9aab27b..25c071940c52a 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -253,6 +253,7 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rdx
 ; SSE2-NEXT:    xorq %rsi, %rdx
+; SSE2-NEXT:    orq %r11, %rdx
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rsi
 ; SSE2-NEXT:    xorq %rdi, %rsi
@@ -261,18 +262,17 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-NEXT:    xorq %r8, %rdi
 ; SSE2-NEXT:    orq %rsi, %rdi
 ; SSE2-NEXT:    orq %rdx, %rdi
-; SSE2-NEXT:    orq %r11, %rdi
 ; SSE2-NEXT:    movq %xmm4, %rdx
 ; SSE2-NEXT:    xorq %r9, %rdx
 ; SSE2-NEXT:    movq %xmm6, %rsi
 ; SSE2-NEXT:    xorq %r10, %rsi
-; SSE2-NEXT:    movq %xmm5, %r8
-; SSE2-NEXT:    xorq %rcx, %r8
+; SSE2-NEXT:    orq %rdx, %rsi
+; SSE2-NEXT:    movq %xmm5, %rdx
+; SSE2-NEXT:    xorq %rcx, %rdx
 ; SSE2-NEXT:    movq %xmm7, %rcx
 ; SSE2-NEXT:    xorq %rax, %rcx
-; SSE2-NEXT:    orq %r8, %rcx
-; SSE2-NEXT:    orq %rsi, %rcx
 ; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    orq %rsi, %rcx
 ; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    orq %rdi, %rcx
 ; SSE2-NEXT:    setne %al
@@ -280,38 +280,38 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ;
 ; SSE41-LABEL: ne_i512:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    movq %xmm2, %rcx
-; SSE41-NEXT:    movq %xmm1, %rdx
-; SSE41-NEXT:    movq %xmm3, %rsi
-; SSE41-NEXT:    pextrq $1, %xmm0, %rdi
-; SSE41-NEXT:    pextrq $1, %xmm2, %r8
-; SSE41-NEXT:    pextrq $1, %xmm1, %r9
-; SSE41-NEXT:    pextrq $1, %xmm3, %r10
+; SSE41-NEXT:    movq %xmm0, %rcx
+; SSE41-NEXT:    movq %xmm2, %rdx
+; SSE41-NEXT:    movq %xmm1, %rsi
+; SSE41-NEXT:    movq %xmm3, %rdi
+; SSE41-NEXT:    pextrq $1, %xmm0, %r8
+; SSE41-NEXT:    pextrq $1, %xmm2, %r9
+; SSE41-NEXT:    pextrq $1, %xmm1, %r10
+; SSE41-NEXT:    pextrq $1, %xmm3, %rax
 ; SSE41-NEXT:    movq %xmm4, %r11
-; SSE41-NEXT:    xorq %rax, %r11
-; SSE41-NEXT:    movq %xmm6, %rax
-; SSE41-NEXT:    xorq %rcx, %rax
-; SSE41-NEXT:    movq %xmm5, %rcx
+; SSE41-NEXT:    xorq %rcx, %r11
+; SSE41-NEXT:    movq %xmm6, %rcx
 ; SSE41-NEXT:    xorq %rdx, %rcx
-; SSE41-NEXT:    movq %xmm7, %rdx
+; SSE41-NEXT:    orq %r11, %rcx
+; SSE41-NEXT:    movq %xmm5, %rdx
 ; SSE41-NEXT:    xorq %rsi, %rdx
-; SSE41-NEXT:    orq %rcx, %rdx
-; SSE41-NEXT:    orq %rax, %rdx
-; SSE41-NEXT:    orq %r11, %rdx
-; SSE41-NEXT:    pextrq $1, %xmm4, %rax
-; SSE41-NEXT:    xorq %rdi, %rax
-; SSE41-NEXT:    pextrq $1, %xmm6, %rcx
+; SSE41-NEXT:    movq %xmm7, %rsi
+; SSE41-NEXT:    xorq %rdi, %rsi
+; SSE41-NEXT:    orq %rdx, %rsi
+; SSE41-NEXT:    orq %rcx, %rsi
+; SSE41-NEXT:    pextrq $1, %xmm4, %rcx
 ; SSE41-NEXT:    xorq %r8, %rcx
-; SSE41-NEXT:    pextrq $1, %xmm5, %rsi
-; SSE41-NEXT:    xorq %r9, %rsi
+; SSE41-NEXT:    pextrq $1, %xmm6, %rdx
+; SSE41-NEXT:    xorq %r9, %rdx
+; SSE41-NEXT:    orq %rcx, %rdx
+; SSE41-NEXT:    pextrq $1, %xmm5, %rcx
+; SSE41-NEXT:    xorq %r10, %rcx
 ; SSE41-NEXT:    pextrq $1, %xmm7, %rdi
-; SSE41-NEXT:    xorq %r10, %rdi
-; SSE41-NEXT:    orq %rsi, %rdi
+; SSE41-NEXT:    xorq %rax, %rdi
 ; SSE41-NEXT:    orq %rcx, %rdi
-; SSE41-NEXT:    orq %rax, %rdi
-; SSE41-NEXT:    xorl %eax, %eax
 ; SSE41-NEXT:    orq %rdx, %rdi
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    orq %rsi, %rdi
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    retq
 ;
@@ -331,6 +331,7 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX1-NEXT:    xorq %rdx, %r11
 ; AVX1-NEXT:    vmovq %xmm3, %rdx
 ; AVX1-NEXT:    xorq %rsi, %rdx
+; AVX1-NEXT:    orq %r11, %rdx
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rsi
 ; AVX1-NEXT:    xorq %rdi, %rsi
@@ -339,18 +340,17 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX1-NEXT:    xorq %r8, %rdi
 ; AVX1-NEXT:    orq %rsi, %rdi
 ; AVX1-NEXT:    orq %rdx, %rdi
-; AVX1-NEXT:    orq %r11, %rdi
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rdx
 ; AVX1-NEXT:    xorq %r9, %rdx
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rsi
 ; AVX1-NEXT:    xorq %r10, %rsi
-; AVX1-NEXT:    vpextrq $1, %xmm0, %r8
-; AVX1-NEXT:    xorq %rcx, %r8
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    xorq %rcx, %rdx
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
 ; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    orq %r8, %rcx
-; AVX1-NEXT:    orq %rsi, %rcx
 ; AVX1-NEXT:    orq %rdx, %rcx
+; AVX1-NEXT:    orq %rsi, %rcx
 ; AVX1-NEXT:    xorl %eax, %eax
 ; AVX1-NEXT:    orq %rdi, %rcx
 ; AVX1-NEXT:    setne %al
@@ -373,6 +373,7 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX2-NEXT:    xorq %rdx, %r11
 ; AVX2-NEXT:    vmovq %xmm3, %rdx
 ; AVX2-NEXT:    xorq %rsi, %rdx
+; AVX2-NEXT:    orq %r11, %rdx
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rsi
 ; AVX2-NEXT:    xorq %rdi, %rsi
@@ -381,18 +382,17 @@ define i32 @ne_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX2-NEXT:    xorq %r8, %rdi
 ; AVX2-NEXT:    orq %rsi, %rdi
 ; AVX2-NEXT:    orq %rdx, %rdi
-; AVX2-NEXT:    orq %r11, %rdi
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rdx
 ; AVX2-NEXT:    xorq %r9, %rdx
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rsi
 ; AVX2-NEXT:    xorq %r10, %rsi
-; AVX2-NEXT:    vpextrq $1, %xmm0, %r8
-; AVX2-NEXT:    xorq %rcx, %r8
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    xorq %rcx, %rdx
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
 ; AVX2-NEXT:    xorq %rax, %rcx
-; AVX2-NEXT:    orq %r8, %rcx
-; AVX2-NEXT:    orq %rsi, %rcx
 ; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    orq %rdi, %rcx
 ; AVX2-NEXT:    setne %al
@@ -444,6 +444,7 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rdx
 ; SSE2-NEXT:    xorq %rsi, %rdx
+; SSE2-NEXT:    orq %r11, %rdx
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
 ; SSE2-NEXT:    movq %xmm0, %rsi
 ; SSE2-NEXT:    xorq %rdi, %rsi
@@ -452,18 +453,17 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; SSE2-NEXT:    xorq %r8, %rdi
 ; SSE2-NEXT:    orq %rsi, %rdi
 ; SSE2-NEXT:    orq %rdx, %rdi
-; SSE2-NEXT:    orq %r11, %rdi
 ; SSE2-NEXT:    movq %xmm4, %rdx
 ; SSE2-NEXT:    xorq %r9, %rdx
 ; SSE2-NEXT:    movq %xmm6, %rsi
 ; SSE2-NEXT:    xorq %r10, %rsi
-; SSE2-NEXT:    movq %xmm5, %r8
-; SSE2-NEXT:    xorq %rcx, %r8
+; SSE2-NEXT:    orq %rdx, %rsi
+; SSE2-NEXT:    movq %xmm5, %rdx
+; SSE2-NEXT:    xorq %rcx, %rdx
 ; SSE2-NEXT:    movq %xmm7, %rcx
 ; SSE2-NEXT:    xorq %rax, %rcx
-; SSE2-NEXT:    orq %r8, %rcx
-; SSE2-NEXT:    orq %rsi, %rcx
 ; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    orq %rsi, %rcx
 ; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    orq %rdi, %rcx
 ; SSE2-NEXT:    sete %al
@@ -471,38 +471,38 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ;
 ; SSE41-LABEL: eq_i512:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    movq %xmm2, %rcx
-; SSE41-NEXT:    movq %xmm1, %rdx
-; SSE41-NEXT:    movq %xmm3, %rsi
-; SSE41-NEXT:    pextrq $1, %xmm0, %rdi
-; SSE41-NEXT:    pextrq $1, %xmm2, %r8
-; SSE41-NEXT:    pextrq $1, %xmm1, %r9
-; SSE41-NEXT:    pextrq $1, %xmm3, %r10
+; SSE41-NEXT:    movq %xmm0, %rcx
+; SSE41-NEXT:    movq %xmm2, %rdx
+; SSE41-NEXT:    movq %xmm1, %rsi
+; SSE41-NEXT:    movq %xmm3, %rdi
+; SSE41-NEXT:    pextrq $1, %xmm0, %r8
+; SSE41-NEXT:    pextrq $1, %xmm2, %r9
+; SSE41-NEXT:    pextrq $1, %xmm1, %r10
+; SSE41-NEXT:    pextrq $1, %xmm3, %rax
 ; SSE41-NEXT:    movq %xmm4, %r11
-; SSE41-NEXT:    xorq %rax, %r11
-; SSE41-NEXT:    movq %xmm6, %rax
-; SSE41-NEXT:    xorq %rcx, %rax
-; SSE41-NEXT:    movq %xmm5, %rcx
+; SSE41-NEXT:    xorq %rcx, %r11
+; SSE41-NEXT:    movq %xmm6, %rcx
 ; SSE41-NEXT:    xorq %rdx, %rcx
-; SSE41-NEXT:    movq %xmm7, %rdx
+; SSE41-NEXT:    orq %r11, %rcx
+; SSE41-NEXT:    movq %xmm5, %rdx
 ; SSE41-NEXT:    xorq %rsi, %rdx
-; SSE41-NEXT:    orq %rcx, %rdx
-; SSE41-NEXT:    orq %rax, %rdx
-; SSE41-NEXT:    orq %r11, %rdx
-; SSE41-NEXT:    pextrq $1, %xmm4, %rax
-; SSE41-NEXT:    xorq %rdi, %rax
-; SSE41-NEXT:    pextrq $1, %xmm6, %rcx
+; SSE41-NEXT:    movq %xmm7, %rsi
+; SSE41-NEXT:    xorq %rdi, %rsi
+; SSE41-NEXT:    orq %rdx, %rsi
+; SSE41-NEXT:    orq %rcx, %rsi
+; SSE41-NEXT:    pextrq $1, %xmm4, %rcx
 ; SSE41-NEXT:    xorq %r8, %rcx
-; SSE41-NEXT:    pextrq $1, %xmm5, %rsi
-; SSE41-NEXT:    xorq %r9, %rsi
+; SSE41-NEXT:    pextrq $1, %xmm6, %rdx
+; SSE41-NEXT:    xorq %r9, %rdx
+; SSE41-NEXT:    orq %rcx, %rdx
+; SSE41-NEXT:    pextrq $1, %xmm5, %rcx
+; SSE41-NEXT:    xorq %r10, %rcx
 ; SSE41-NEXT:    pextrq $1, %xmm7, %rdi
-; SSE41-NEXT:    xorq %r10, %rdi
-; SSE41-NEXT:    orq %rsi, %rdi
+; SSE41-NEXT:    xorq %rax, %rdi
 ; SSE41-NEXT:    orq %rcx, %rdi
-; SSE41-NEXT:    orq %rax, %rdi
-; SSE41-NEXT:    xorl %eax, %eax
 ; SSE41-NEXT:    orq %rdx, %rdi
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    orq %rsi, %rdi
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -522,6 +522,7 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX1-NEXT:    xorq %rdx, %r11
 ; AVX1-NEXT:    vmovq %xmm3, %rdx
 ; AVX1-NEXT:    xorq %rsi, %rdx
+; AVX1-NEXT:    orq %r11, %rdx
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rsi
 ; AVX1-NEXT:    xorq %rdi, %rsi
@@ -530,18 +531,17 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX1-NEXT:    xorq %r8, %rdi
 ; AVX1-NEXT:    orq %rsi, %rdi
 ; AVX1-NEXT:    orq %rdx, %rdi
-; AVX1-NEXT:    orq %r11, %rdi
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rdx
 ; AVX1-NEXT:    xorq %r9, %rdx
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rsi
 ; AVX1-NEXT:    xorq %r10, %rsi
-; AVX1-NEXT:    vpextrq $1, %xmm0, %r8
-; AVX1-NEXT:    xorq %rcx, %r8
+; AVX1-NEXT:    orq %rdx, %rsi
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    xorq %rcx, %rdx
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
 ; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    orq %r8, %rcx
-; AVX1-NEXT:    orq %rsi, %rcx
 ; AVX1-NEXT:    orq %rdx, %rcx
+; AVX1-NEXT:    orq %rsi, %rcx
 ; AVX1-NEXT:    xorl %eax, %eax
 ; AVX1-NEXT:    orq %rdi, %rcx
 ; AVX1-NEXT:    sete %al
@@ -564,6 +564,7 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX2-NEXT:    xorq %rdx, %r11
 ; AVX2-NEXT:    vmovq %xmm3, %rdx
 ; AVX2-NEXT:    xorq %rsi, %rdx
+; AVX2-NEXT:    orq %r11, %rdx
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rsi
 ; AVX2-NEXT:    xorq %rdi, %rsi
@@ -572,18 +573,17 @@ define i32 @eq_i512(<8 x i64> %x, <8 x i64> %y) {
 ; AVX2-NEXT:    xorq %r8, %rdi
 ; AVX2-NEXT:    orq %rsi, %rdi
 ; AVX2-NEXT:    orq %rdx, %rdi
-; AVX2-NEXT:    orq %r11, %rdi
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rdx
 ; AVX2-NEXT:    xorq %r9, %rdx
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rsi
 ; AVX2-NEXT:    xorq %r10, %rsi
-; AVX2-NEXT:    vpextrq $1, %xmm0, %r8
-; AVX2-NEXT:    xorq %rcx, %r8
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    xorq %rcx, %rdx
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
 ; AVX2-NEXT:    xorq %rax, %rcx
-; AVX2-NEXT:    orq %r8, %rcx
-; AVX2-NEXT:    orq %rsi, %rcx
 ; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    orq %rsi, %rcx
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    orq %rdi, %rcx
 ; AVX2-NEXT:    sete %al
@@ -736,28 +736,28 @@ define i32 @eq_i128_pair(ptr %a, ptr %b) {
 define i32 @ne_i256_pair(ptr %a, ptr %b) {
 ; SSE2-LABEL: ne_i256_pair:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq 16(%rdi), %rcx
-; SSE2-NEXT:    movq 24(%rdi), %rdx
-; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rcx
+; SSE2-NEXT:    movq (%rdi), %rdx
 ; SSE2-NEXT:    movq 8(%rdi), %r8
 ; SSE2-NEXT:    xorq 8(%rsi), %r8
-; SSE2-NEXT:    xorq 24(%rsi), %rdx
-; SSE2-NEXT:    xorq (%rsi), %rax
-; SSE2-NEXT:    xorq 16(%rsi), %rcx
+; SSE2-NEXT:    xorq 24(%rsi), %rcx
+; SSE2-NEXT:    xorq (%rsi), %rdx
+; SSE2-NEXT:    xorq 16(%rsi), %rax
 ; SSE2-NEXT:    movq 48(%rdi), %r9
 ; SSE2-NEXT:    movq 32(%rdi), %r10
 ; SSE2-NEXT:    movq 56(%rdi), %r11
 ; SSE2-NEXT:    movq 40(%rdi), %rdi
 ; SSE2-NEXT:    xorq 40(%rsi), %rdi
+; SSE2-NEXT:    orq %r8, %rdi
 ; SSE2-NEXT:    xorq 56(%rsi), %r11
-; SSE2-NEXT:    orq %rdx, %r11
+; SSE2-NEXT:    orq %rcx, %r11
 ; SSE2-NEXT:    orq %rdi, %r11
-; SSE2-NEXT:    orq %r8, %r11
 ; SSE2-NEXT:    xorq 32(%rsi), %r10
+; SSE2-NEXT:    orq %rdx, %r10
 ; SSE2-NEXT:    xorq 48(%rsi), %r9
-; SSE2-NEXT:    orq %rcx, %r9
-; SSE2-NEXT:    orq %r10, %r9
 ; SSE2-NEXT:    orq %rax, %r9
+; SSE2-NEXT:    orq %r10, %r9
 ; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    orq %r11, %r9
 ; SSE2-NEXT:    setne %al
@@ -765,28 +765,28 @@ define i32 @ne_i256_pair(ptr %a, ptr %b) {
 ;
 ; SSE41-LABEL: ne_i256_pair:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq 16(%rdi), %rcx
-; SSE41-NEXT:    movq 24(%rdi), %rdx
-; SSE41-NEXT:    movq (%rdi), %rax
+; SSE41-NEXT:    movq 16(%rdi), %rax
+; SSE41-NEXT:    movq 24(%rdi), %rcx
+; SSE41-NEXT:    movq (%rdi), %rdx
 ; SSE41-NEXT:    movq 8(%rdi), %r8
 ; SSE41-NEXT:    xorq 8(%rsi), %r8
-; SSE41-NEXT:    xorq 24(%rsi), %rdx
-; SSE41-NEXT:    xorq (%rsi), %rax
-; SSE41-NEXT:    xorq 16(%rsi), %rcx
+; SSE41-NEXT:    xorq 24(%rsi), %rcx
+; SSE41-NEXT:    xorq (%rsi), %rdx
+; SSE41-NEXT:    xorq 16(%rsi), %rax
 ; SSE41-NEXT:    movq 48(%rdi), %r9
 ; SSE41-NEXT:    movq 32(%rdi), %r10
 ; SSE41-NEXT:    movq 56(%rdi), %r11
 ; SSE41-NEXT:    movq 40(%rdi), %rdi
 ; SSE41-NEXT:    xorq 40(%rsi), %rdi
+; SSE41-NEXT:    orq %r8, %rdi
 ; SSE41-NEXT:    xorq 56(%rsi), %r11
-; SSE41-NEXT:    orq %rdx, %r11
+; SSE41-NEXT:    orq %rcx, %r11
 ; SSE41-NEXT:    orq %rdi, %r11
-; SSE41-NEXT:    orq %r8, %r11
 ; SSE41-NEXT:    xorq 32(%rsi), %r10
+; SSE41-NEXT:    orq %rdx, %r10
 ; SSE41-NEXT:    xorq 48(%rsi), %r9
-; SSE41-NEXT:    orq %rcx, %r9
-; SSE41-NEXT:    orq %r10, %r9
 ; SSE41-NEXT:    orq %rax, %r9
+; SSE41-NEXT:    orq %r10, %r9
 ; SSE41-NEXT:    xorl %eax, %eax
 ; SSE41-NEXT:    orq %r11, %r9
 ; SSE41-NEXT:    setne %al
@@ -850,28 +850,28 @@ define i32 @ne_i256_pair(ptr %a, ptr %b) {
 define i32 @eq_i256_pair(ptr %a, ptr %b) {
 ; SSE2-LABEL: eq_i256_pair:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq 16(%rdi), %rcx
-; SSE2-NEXT:    movq 24(%rdi), %rdx
-; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movq 16(%rdi), %rax
+; SSE2-NEXT:    movq 24(%rdi), %rcx
+; SSE2-NEXT:    movq (%rdi), %rdx
 ; SSE2-NEXT:    movq 8(%rdi), %r8
 ; SSE2-NEXT:    xorq 8(%rsi), %r8
-; SSE2-NEXT:    xorq 24(%rsi), %rdx
-; SSE2-NEXT:    xorq (%rsi), %rax
-; SSE2-NEXT:    xorq 16(%rsi), %rcx
+; SSE2-NEXT:    xorq 24(%rsi), %rcx
+; SSE2-NEXT:    xorq (%rsi), %rdx
+; SSE2-NEXT:    xorq 16(%rsi), %rax
 ; SSE2-NEXT:    movq 48(%rdi), %r9
 ; SSE2-NEXT:    movq 32(%rdi), %r10
 ; SSE2-NEXT:    movq 56(%rdi), %r11
 ; SSE2-NEXT:    movq 40(%rdi), %rdi
 ; SSE2-NEXT:    xorq 40(%rsi), %rdi
+; SSE2-NEXT:    orq %r8, %rdi
 ; SSE2-NEXT:    xorq 56(%rsi), %r11
-; SSE2-NEXT:    orq %rdx, %r11
+; SSE2-NEXT:    orq %rcx, %r11
 ; SSE2-NEXT:    orq %rdi, %r11
-; SSE2-NEXT:    orq %r8, %r11
 ; SSE2-NEXT:    xorq 32(%rsi), %r10
+; SSE2-NEXT:    orq %rdx, %r10
 ; SSE2-NEXT:    xorq 48(%rsi), %r9
-; SSE2-NEXT:    orq %rcx, %r9
-; SSE2-NEXT:    orq %r10, %r9
 ; SSE2-NEXT:    orq %rax, %r9
+; SSE2-NEXT:    orq %r10, %r9
 ; SSE2-NEXT:    xorl %eax, %eax
 ; SSE2-NEXT:    orq %r11, %r9
 ; SSE2-NEXT:    sete %al
@@ -879,28 +879,28 @@ define i32 @eq_i256_pair(ptr %a, ptr %b) {
 ;
 ; SSE41-LABEL: eq_i256_pair:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq 16(%rdi), %rcx
-; SSE41-NEXT:    movq 24(%rdi), %rdx
-; SSE41-NEXT:    movq (%rdi), %rax
+; SSE41-NEXT:    movq 16(%rdi), %rax
+; SSE41-NEXT:    movq 24(%rdi), %rcx
+; SSE41-NEXT:    movq (%rdi), %rdx
 ; SSE41-NEXT:    movq 8(%rdi), %r8
 ; SSE41-NEXT:    xorq 8(%rsi), %r8
-; SSE41-NEXT:    xorq 24(%rsi), %rdx
-; SSE41-NEXT:    xorq (%rsi), %rax
-; SSE41-NEXT:    xorq 16(%rsi), %rcx
+; SSE41-NEXT:    xorq 24(%rsi), %rcx
+; SSE41-NEXT:    xorq (%rsi), %rdx
+; SSE41-NEXT:    xorq 16(%rsi), %rax
 ; SSE41-NEXT:    movq 48(%rdi), %r9
 ; SSE41-NEXT:    movq 32(%rdi), %r10
 ; SSE41-NEXT:    movq 56(%rdi), %r11
 ; SSE41-NEXT:    movq 40(%rdi), %rdi
 ; SSE41-NEXT:    xorq 40(%rsi), %rdi
+; SSE41-NEXT:    orq %r8, %rdi
 ; SSE41-NEXT:    xorq 56(%rsi), %r11
-; SSE41-NEXT:    orq %rdx, %r11
+; SSE41-NEXT:    orq %rcx, %r11
 ; SSE41-NEXT:    orq %rdi, %r11
-; SSE41-NEXT:    orq %r8, %r11
 ; SSE41-NEXT:    xorq 32(%rsi), %r10
+; SSE41-NEXT:    orq %rdx, %r10
 ; SSE41-NEXT:    xorq 48(%rsi), %r9
-; SSE41-NEXT:    orq %rcx, %r9
-; SSE41-NEXT:    orq %r10, %r9
 ; SSE41-NEXT:    orq %rax, %r9
+; SSE41-NEXT:    orq %r10, %r9
 ; SSE41-NEXT:    xorl %eax, %eax
 ; SSE41-NEXT:    orq %r11, %r9
 ; SSE41-NEXT:    sete %al
@@ -964,54 +964,54 @@ define i32 @eq_i256_pair(ptr %a, ptr %b) {
 define i32 @ne_i512_pair(ptr %a, ptr %b) {
 ; NO512-LABEL: ne_i512_pair:
 ; NO512:       # %bb.0:
-; NO512-NEXT:    movq 32(%rdi), %rax
-; NO512-NEXT:    movq 48(%rdi), %rcx
-; NO512-NEXT:    movq 40(%rdi), %rdx
-; NO512-NEXT:    movq 56(%rdi), %r8
-; NO512-NEXT:    xorq 56(%rsi), %r8
-; NO512-NEXT:    movq 120(%rdi), %r9
-; NO512-NEXT:    xorq 120(%rsi), %r9
-; NO512-NEXT:    orq %r8, %r9
+; NO512-NEXT:    movq 40(%rdi), %rax
+; NO512-NEXT:    movq 56(%rdi), %rcx
+; NO512-NEXT:    movq 24(%rdi), %rdx
+; NO512-NEXT:    xorq 24(%rsi), %rdx
+; NO512-NEXT:    xorq 56(%rsi), %rcx
 ; NO512-NEXT:    movq 88(%rdi), %r8
 ; NO512-NEXT:    xorq 88(%rsi), %r8
-; NO512-NEXT:    orq %r8, %r9
-; NO512-NEXT:    movq 24(%rdi), %r8
-; NO512-NEXT:    xorq 24(%rsi), %r8
-; NO512-NEXT:    xorq 40(%rsi), %rdx
-; NO512-NEXT:    orq %r8, %r9
-; NO512-NEXT:    movq 104(%rdi), %r8
-; NO512-NEXT:    xorq 104(%rsi), %r8
 ; NO512-NEXT:    orq %rdx, %r8
-; NO512-NEXT:    movq 72(%rdi), %rdx
-; NO512-NEXT:    xorq 72(%rsi), %rdx
+; NO512-NEXT:    movq 120(%rdi), %rdx
+; NO512-NEXT:    xorq 120(%rsi), %rdx
+; NO512-NEXT:    orq %rcx, %rdx
+; NO512-NEXT:    movq 8(%rdi), %rcx
+; NO512-NEXT:    xorq 8(%rsi), %rcx
+; NO512-NEXT:    xorq 40(%rsi), %rax
+; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    movq 72(%rdi), %r8
+; NO512-NEXT:    xorq 72(%rsi), %r8
+; NO512-NEXT:    orq %rcx, %r8
+; NO512-NEXT:    movq 104(%rdi), %rcx
+; NO512-NEXT:    xorq 104(%rsi), %rcx
+; NO512-NEXT:    orq %rax, %rcx
+; NO512-NEXT:    movq 48(%rdi), %rax
+; NO512-NEXT:    orq %r8, %rcx
+; NO512-NEXT:    movq 16(%rdi), %r8
+; NO512-NEXT:    xorq 16(%rsi), %r8
+; NO512-NEXT:    xorq 48(%rsi), %rax
+; NO512-NEXT:    orq %rdx, %rcx
+; NO512-NEXT:    movq 80(%rdi), %rdx
+; NO512-NEXT:    xorq 80(%rsi), %rdx
+; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    movq 112(%rdi), %r8
+; NO512-NEXT:    xorq 112(%rsi), %r8
+; NO512-NEXT:    orq %rax, %r8
+; NO512-NEXT:    movq (%rdi), %rax
+; NO512-NEXT:    xorq (%rsi), %rax
 ; NO512-NEXT:    orq %rdx, %r8
-; NO512-NEXT:    movq 16(%rdi), %rdx
-; NO512-NEXT:    orq %r9, %r8
-; NO512-NEXT:    movq 8(%rdi), %r9
-; NO512-NEXT:    xorq 8(%rsi), %r9
-; NO512-NEXT:    xorq 48(%rsi), %rcx
-; NO512-NEXT:    orq %r9, %r8
-; NO512-NEXT:    movq 112(%rdi), %r9
-; NO512-NEXT:    xorq 112(%rsi), %r9
-; NO512-NEXT:    orq %rcx, %r9
-; NO512-NEXT:    movq 80(%rdi), %rcx
-; NO512-NEXT:    xorq 80(%rsi), %rcx
-; NO512-NEXT:    orq %rcx, %r9
-; NO512-NEXT:    movq (%rdi), %rcx
-; NO512-NEXT:    xorq 16(%rsi), %rdx
-; NO512-NEXT:    xorq (%rsi), %rcx
-; NO512-NEXT:    xorq 32(%rsi), %rax
-; NO512-NEXT:    orq %rdx, %r9
-; NO512-NEXT:    movq 96(%rdi), %rdx
-; NO512-NEXT:    movq 64(%rdi), %rdi
-; NO512-NEXT:    xorq 64(%rsi), %rdi
-; NO512-NEXT:    xorq 96(%rsi), %rdx
+; NO512-NEXT:    movq 64(%rdi), %rdx
+; NO512-NEXT:    xorq 64(%rsi), %rdx
 ; NO512-NEXT:    orq %rax, %rdx
-; NO512-NEXT:    orq %rdi, %rdx
-; NO512-NEXT:    orq %r9, %rdx
-; NO512-NEXT:    orq %rcx, %rdx
+; NO512-NEXT:    movq 32(%rdi), %rax
+; NO512-NEXT:    xorq 32(%rsi), %rax
+; NO512-NEXT:    movq 96(%rdi), %rdi
+; NO512-NEXT:    xorq 96(%rsi), %rdi
+; NO512-NEXT:    orq %rax, %rdi
+; NO512-NEXT:    orq %rdx, %rdi
+; NO512-NEXT:    orq %r8, %rdi
 ; NO512-NEXT:    xorl %eax, %eax
-; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    orq %rcx, %rdi
 ; NO512-NEXT:    setne %al
 ; NO512-NEXT:    retq
 ;
@@ -1058,54 +1058,54 @@ define i32 @ne_i512_pair(ptr %a, ptr %b) {
 define i32 @eq_i512_pair(ptr %a, ptr %b) {
 ; NO512-LABEL: eq_i512_pair:
 ; NO512:       # %bb.0:
-; NO512-NEXT:    movq 32(%rdi), %rax
-; NO512-NEXT:    movq 48(%rdi), %rcx
-; NO512-NEXT:    movq 40(%rdi), %rdx
-; NO512-NEXT:    movq 56(%rdi), %r8
-; NO512-NEXT:    xorq 56(%rsi), %r8
-; NO512-NEXT:    movq 120(%rdi), %r9
-; NO512-NEXT:    xorq 120(%rsi), %r9
-; NO512-NEXT:    orq %r8, %r9
+; NO512-NEXT:    movq 40(%rdi), %rax
+; NO512-NEXT:    movq 56(%rdi), %rcx
+; NO512-NEXT:    movq 24(%rdi), %rdx
+; NO512-NEXT:    xorq 24(%rsi), %rdx
+; NO512-NEXT:    xorq 56(%rsi), %rcx
 ; NO512-NEXT:    movq 88(%rdi), %r8
 ; NO512-NEXT:    xorq 88(%rsi), %r8
-; NO512-NEXT:    orq %r8, %r9
-; NO512-NEXT:    movq 24(%rdi), %r8
-; NO512-NEXT:    xorq 24(%rsi), %r8
-; NO512-NEXT:    xorq 40(%rsi), %rdx
-; NO512-NEXT:    orq %r8, %r9
-; NO512-NEXT:    movq 104(%rdi), %r8
-; NO512-NEXT:    xorq 104(%rsi), %r8
 ; NO512-NEXT:    orq %rdx, %r8
-; NO512-NEXT:    movq 72(%rdi), %rdx
-; NO512-NEXT:    xorq 72(%rsi), %rdx
+; NO512-NEXT:    movq 120(%rdi), %rdx
+; NO512-NEXT:    xorq 120(%rsi), %rdx
+; NO512-NEXT:    orq %rcx, %rdx
+; NO512-NEXT:    movq 8(%rdi), %rcx
+; NO512-NEXT:    xorq 8(%rsi), %rcx
+; NO512-NEXT:    xorq 40(%rsi), %rax
+; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    movq 72(%rdi), %r8
+; NO512-NEXT:    xorq 72(%rsi), %r8
+; NO512-NEXT:    orq %rcx, %r8
+; NO512-NEXT:    movq 104(%rdi), %rcx
+; NO512-NEXT:    xorq 104(%rsi), %rcx
+; NO512-NEXT:    orq %rax, %rcx
+; NO512-NEXT:    movq 48(%rdi), %rax
+; NO512-NEXT:    orq %r8, %rcx
+; NO512-NEXT:    movq 16(%rdi), %r8
+; NO512-NEXT:    xorq 16(%rsi), %r8
+; NO512-NEXT:    xorq 48(%rsi), %rax
+; NO512-NEXT:    orq %rdx, %rcx
+; NO512-NEXT:    movq 80(%rdi), %rdx
+; NO512-NEXT:    xorq 80(%rsi), %rdx
+; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    movq 112(%rdi), %r8
+; NO512-NEXT:    xorq 112(%rsi), %r8
+; NO512-NEXT:    orq %rax, %r8
+; NO512-NEXT:    movq (%rdi), %rax
+; NO512-NEXT:    xorq (%rsi), %rax
 ; NO512-NEXT:    orq %rdx, %r8
-; NO512-NEXT:    movq 16(%rdi), %rdx
-; NO512-NEXT:    orq %r9, %r8
-; NO512-NEXT:    movq 8(%rdi), %r9
-; NO512-NEXT:    xorq 8(%rsi), %r9
-; NO512-NEXT:    xorq 48(%rsi), %rcx
-; NO512-NEXT:    orq %r9, %r8
-; NO512-NEXT:    movq 112(%rdi), %r9
-; NO512-NEXT:    xorq 112(%rsi), %r9
-; NO512-NEXT:    orq %rcx, %r9
-; NO512-NEXT:    movq 80(%rdi), %rcx
-; NO512-NEXT:    xorq 80(%rsi), %rcx
-; NO512-NEXT:    orq %rcx, %r9
-; NO512-NEXT:    movq (%rdi), %rcx
-; NO512-NEXT:    xorq 16(%rsi), %rdx
-; NO512-NEXT:    xorq (%rsi), %rcx
-; NO512-NEXT:    xorq 32(%rsi), %rax
-; NO512-NEXT:    orq %rdx, %r9
-; NO512-NEXT:    movq 96(%rdi), %rdx
-; NO512-NEXT:    movq 64(%rdi), %rdi
-; NO512-NEXT:    xorq 64(%rsi), %rdi
-; NO512-NEXT:    xorq 96(%rsi), %rdx
+; NO512-NEXT:    movq 64(%rdi), %rdx
+; NO512-NEXT:    xorq 64(%rsi), %rdx
 ; NO512-NEXT:    orq %rax, %rdx
-; NO512-NEXT:    orq %rdi, %rdx
-; NO512-NEXT:    orq %r9, %rdx
-; NO512-NEXT:    orq %rcx, %rdx
+; NO512-NEXT:    movq 32(%rdi), %rax
+; NO512-NEXT:    xorq 32(%rsi), %rax
+; NO512-NEXT:    movq 96(%rdi), %rdi
+; NO512-NEXT:    xorq 96(%rsi), %rdi
+; NO512-NEXT:    orq %rax, %rdi
+; NO512-NEXT:    orq %rdx, %rdi
+; NO512-NEXT:    orq %r8, %rdi
 ; NO512-NEXT:    xorl %eax, %eax
-; NO512-NEXT:    orq %r8, %rdx
+; NO512-NEXT:    orq %rcx, %rdi
 ; NO512-NEXT:    sete %al
 ; NO512-NEXT:    retq
 ;
@@ -1184,16 +1184,16 @@ define i1 @eq_i512_args(i512 %a, i512 %b) {
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rcx
 ; ANY-NEXT:    orq %r10, %rcx
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %r9
-; ANY-NEXT:    orq %rcx, %r9
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rsi
 ; ANY-NEXT:    orq %r9, %rsi
+; ANY-NEXT:    orq %rcx, %rsi
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rdx
 ; ANY-NEXT:    orq %rax, %rdx
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %r8
-; ANY-NEXT:    orq %rdx, %r8
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rdi
 ; ANY-NEXT:    orq %r8, %rdi
+; ANY-NEXT:    orq %rdx, %rdi
 ; ANY-NEXT:    orq %rsi, %rdi
 ; ANY-NEXT:    sete %al
 ; ANY-NEXT:    retq
@@ -1252,18 +1252,18 @@ define i1 @eq_i512_op(i512 %a, i512 %b) {
 ; ANY-NEXT:    adcq $0, %rax
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rsi
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %r9
+; ANY-NEXT:    orq %rsi, %r9
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rcx
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
 ; ANY-NEXT:    orq %rcx, %rax
 ; ANY-NEXT:    orq %r9, %rax
-; ANY-NEXT:    orq %rsi, %rax
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rdx
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %r10
 ; ANY-NEXT:    orq %rdx, %r10
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %r8
-; ANY-NEXT:    orq %r10, %r8
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rdi
 ; ANY-NEXT:    orq %r8, %rdi
+; ANY-NEXT:    orq %r10, %rdi
 ; ANY-NEXT:    orq %rax, %rdi
 ; ANY-NEXT:    sete %al
 ; ANY-NEXT:    retq
@@ -1313,14 +1313,14 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
 ; ANY-NEXT:    orq %r8, %r11
 ; ANY-NEXT:    xorq 8(%rdi), %rdx
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
-; ANY-NEXT:    orq %r11, %rax
 ; ANY-NEXT:    orq %rdx, %rax
+; ANY-NEXT:    orq %r11, %rax
 ; ANY-NEXT:    xorq 32(%rdi), %r9
 ; ANY-NEXT:    xorq (%rdi), %rsi
+; ANY-NEXT:    orq %r9, %rsi
 ; ANY-NEXT:    xorq 16(%rdi), %rcx
 ; ANY-NEXT:    xorq {{[0-9]+}}(%rsp), %r10
 ; ANY-NEXT:    orq %rcx, %r10
-; ANY-NEXT:    orq %r9, %r10
 ; ANY-NEXT:    orq %rsi, %r10
 ; ANY-NEXT:    orq %rax, %r10
 ; ANY-NEXT:    sete %al

diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index 815c96540e45c..ab504d0a43fef 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -630,8 +630,8 @@ define i32 @logic_tree_with_mismatching_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %
 ; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    shll $15, %edi
 ; X64-NEXT:    shll $16, %eax
+; X64-NEXT:    orl %esi, %edi
 ; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
 ; X64-NEXT:    orl %edi, %eax
 ; X64-NEXT:    retq
   %a.shifted = shl i32 %a, 15
@@ -658,8 +658,8 @@ define i32 @logic_tree_with_mismatching_shifts2_i32(i32 %a, i32 %b, i32 %c, i32
 ; X64-NEXT:    movl %edx, %eax
 ; X64-NEXT:    shll $16, %edi
 ; X64-NEXT:    shrl $16, %eax
+; X64-NEXT:    orl %esi, %edi
 ; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
 ; X64-NEXT:    orl %edi, %eax
 ; X64-NEXT:    retq
   %a.shifted = shl i32 %a, 16
@@ -777,8 +777,8 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
 ; X64:       # %bb.0:
 ; X64-NEXT:    pslld $16, %xmm0
 ; X64-NEXT:    pslld $17, %xmm2
+; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    por %xmm3, %xmm2
-; X64-NEXT:    por %xmm1, %xmm2
 ; X64-NEXT:    por %xmm2, %xmm0
 ; X64-NEXT:    retq
   %a.shifted = shl <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>

diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index 6d8b83824a6d5..fb7bc96d5bf09 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,7 +191,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $192, %esp
+; X86-NEXT:    subl $188, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
@@ -229,7 +229,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -254,14 +254,14 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    addl %edx, %esi
@@ -297,7 +297,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -329,7 +329,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -363,9 +363,9 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
@@ -381,29 +381,31 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %bl, %esi
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -415,10 +417,10 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -427,46 +429,45 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %al, %ebp
 ; X86-NEXT:    adcl %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -479,11 +480,11 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movzbl %cl, %ebx
 ; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -505,7 +506,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
@@ -559,7 +560,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
@@ -582,7 +583,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -601,10 +602,10 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -612,8 +613,8 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, %esi
@@ -621,7 +622,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movzbl %bl, %edi
 ; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -655,7 +656,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
@@ -674,125 +675,132 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %al, %ebp
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    xorl %edx, %ebp
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    xorl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    xorl %edx, %ebp
+; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %ebp, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    andl $1, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %esi
 ; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %ebx
-; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    xorl %edi, %eax
 ; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -805,7 +813,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movb %cl, 16(%eax)
 ; X86-NEXT:    setne 20(%eax)
-; X86-NEXT:    addl $192, %esp
+; X86-NEXT:    addl $188, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -973,9 +981,9 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    xorq %rcx, %r10
 ; X64-NEXT:    orq %rdx, %r10
 ; X64-NEXT:    xorq %rcx, %rax
-; X64-NEXT:    orq %r10, %rax
 ; X64-NEXT:    xorq %rbx, %rcx
 ; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    orq %r10, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movl %eax, %esi
 ; X64-NEXT:    andl $1, %esi

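In smul-with-overflow.ll the X86 churn is mostly spill-slot renumbering (hence the 192 -> 188 frame-size change) that falls out of different value lifetimes once the final xor/or sign-check reduction is rebalanced; the small X64 hunk is the same OR rebalance. A sketch of the underlying test shape (illustrative i64 width; the actual test uses i129):

    declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)

    define i1 @smul_ovf_sketch(i64 %x, i64 %y) {
      ; The overflow bit compares the high half of the widened product
      ; against the sign of the low half; that comparison lowers to the
      ; xor/or reduction chain being reassociated in the hunks above.
      %p = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
      %ovf = extractvalue { i64, i1 } %p, 1
      ret i1 %ovf
    }
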
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index fd25842180922..8c2b945d6a8ce 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -231,8 +231,8 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4

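smul_fix.ll is the simplest instance: previously the two imull partial products were summed together and then added into %edx; now the first is folded into %edx as soon as it is ready instead of waiting for both imulls to complete. func5 exercises the fixed-point multiply intrinsic; its shape, roughly (sketch; the scale value here is illustrative):

    declare i64 @llvm.smul.fix.i64(i64, i64, i32)

    define i64 @smul_fix_sketch(i64 %x, i64 %y) {
      ; On 32-bit x86 an i64 product is one mull plus two imulls whose
      ; results are folded into the high half; the hunk above only changes
      ; the order in which those partial products are accumulated.
      %r = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2)
      ret i64 %r
    }
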
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index fc5a54b3cf4ce..75fc1d34fa1cb 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -376,65 +376,66 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    imull %edx, %ebx
 ; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    imull %ecx, %ebp
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    xorl $2147483647, %edi # imm = 0x7FFFFFFF
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    cmovel %ecx, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    xorl $2147483647, %esi # imm = 0x7FFFFFFF
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    notl %ecx
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovel %ebp, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi

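smul_fix_sat.ll shows the other effect noted in the log: with profitability now checked, a different pattern fires, and the shifted value lifetimes change which registers survive to the saturation compare - so the diff is mostly register renaming plus the same partial-product reordering. The saturating sibling of the intrinsic above, for reference (sketch; scale is again illustrative):

    declare i64 @llvm.smul.fix.sat.i64(i64, i64, i32)

    define i64 @smul_fix_sat_sketch(i64 %x, i64 %y) {
      ; Same widened multiply as smul_fix, followed by a clamp to the
      ; signed i64 range on overflow; the notl/xorl $2147483647 plus the
      ; cmovel pair in the checks above materialize that clamp.
      %r = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 2)
      ret i64 %r
    }
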
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 7113aaf7e83ed..fbbc857a38b4e 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -14,56 +14,58 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %rbx, -32
 ; X64-NEXT:    .cfi_offset %r14, -24
 ; X64-NEXT:    .cfi_offset %r15, -16
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rdi, %r11
 ; X64-NEXT:    movq %rsi, %rdi
 ; X64-NEXT:    sarq $63, %rdi
-; X64-NEXT:    movq %rcx, %rbx
-; X64-NEXT:    imulq %rdi, %rbx
+; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    imulq %rdi, %r9
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rax, %rbx
-; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rax, %rdi
+; X64-NEXT:    addq %r9, %rdi
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    imulq %rsi, %r15
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    addq %rdx, %r15
-; X64-NEXT:    addq %rdi, %r9
-; X64-NEXT:    adcq %rbx, %r15
-; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    addq %r15, %r9
+; X64-NEXT:    addq %rax, %r9
+; X64-NEXT:    addq %r14, %r10
+; X64-NEXT:    adcq %rdi, %r9
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rbx, %r14
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r14, %r10
-; X64-NEXT:    adcq %r11, %rbx
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %r14, %r15
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r15, %r11
+; X64-NEXT:    adcq %rbx, %r14
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %r11d
+; X64-NEXT:    movzbl %al, %ebx
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    adcq %r11, %rdx
-; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    adcq %r15, %rdx
-; X64-NEXT:    movq %r10, 8(%r8)
-; X64-NEXT:    sarq $63, %r10
-; X64-NEXT:    xorq %r10, %rdx
-; X64-NEXT:    xorq %rax, %r10
-; X64-NEXT:    orq %rdx, %r10
+; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    adcq %rbx, %rdx
+; X64-NEXT:    addq %r10, %rax
+; X64-NEXT:    adcq %r9, %rdx
+; X64-NEXT:    movq %r11, 8(%r8)
+; X64-NEXT:    sarq $63, %r11
+; X64-NEXT:    xorq %r11, %rdx
+; X64-NEXT:    xorq %rax, %r11
+; X64-NEXT:    orq %rdx, %r11
 ; X64-NEXT:    setne %al
 ; X64-NEXT:    movq %rdi, (%r8)
 ; X64-NEXT:    popq %rbx
@@ -88,217 +90,218 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    imull %eax, %edi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    xorl %eax, %ebp
-; X86-NEXT:    xorl %esi, %eax
-; X86-NEXT:    orl %ebp, %eax
-; X86-NEXT:    orl %edi, %eax
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    orl %eax, %ebp
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %ebp, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -340,36 +343,36 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rcx, %r13
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rsi, %r11
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rsi, %rbx
-; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rbx, %r14
-; X64-NEXT:    adcq %rcx, %r12
+; X64-NEXT:    adcq %rsi, %r12
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %ecx
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movzbl %al, %r10d
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %r12, %rsi
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    adcq %r10, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
@@ -386,168 +389,171 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    adcq %r12, %rbx
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rbx, %rbp
-; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r13
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    addq %r10, %rbp
+; X64-NEXT:    addq %r15, %rbp
 ; X64-NEXT:    adcq %r14, %r13
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    movq %r11, %rbx
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r8, %r9
-; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rdi, %r10
+; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    adcq %r10, %r11
-; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %r10, %r14
+; X64-NEXT:    adcq %r9, %r11
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r11, %r8
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r10
-; X64-NEXT:    addq %rbp, %r14
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %r11, %r9
+; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    adcq %rax, %rdi
+; X64-NEXT:    addq %rbp, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r13, %r14
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r13, %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    addq %rsi, %r8
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
-; X64-NEXT:    setb %cl
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    addq %rsi, %r9
+; X64-NEXT:    adcq %rcx, %rdi
+; X64-NEXT:    setb %bl
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Reload
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rsi, %r9
-; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rcx, %r10
+; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    adcq %rdi, %r11
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    addq %r10, %rax
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    adcq %rsi, %r11
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r11, %r13
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r10, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %r11, %r15
 ; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rax, %r13
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %r15, %r8
-; X64-NEXT:    sarq $63, %r8
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    addq %r9, %rbp
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movzbl %bl, %eax
+; X64-NEXT:    adcq %rax, %r15
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %r14, %rdi
+; X64-NEXT:    movq %r14, %rbp
+; X64-NEXT:    sarq $63, %rdi
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r9, %r10
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    addq %rsi, %r10
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r9, %r14
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r8, %r9
-; X64-NEXT:    imulq %r12, %r9
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r9, %r8
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r9, %r10
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %rdi, %rsi
+; X64-NEXT:    imulq %r12, %rsi
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq {{[0-9]+}}(%rsp)
-; X64-NEXT:    addq %rax, %r9
-; X64-NEXT:    addq %rdx, %r9
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    adcq %r10, %r9
-; X64-NEXT:    addq %r11, %r14
-; X64-NEXT:    movzbl %sil, %edi
-; X64-NEXT:    adcq %rcx, %rdi
-; X64-NEXT:    addq %rax, %r14
-; X64-NEXT:    adcq %r9, %rdi
+; X64-NEXT:    addq %rsi, %rdx
+; X64-NEXT:    addq %rax, %rdx
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    adcq %r8, %rdx
+; X64-NEXT:    addq %r11, %r10
+; X64-NEXT:    movzbl %cl, %esi
+; X64-NEXT:    adcq %r14, %rsi
+; X64-NEXT:    addq %rax, %r10
+; X64-NEXT:    adcq %rdx, %rsi
 ; X64-NEXT:    sarq $63, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rdx, %r14
 ; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    addq %rax, %r14
 ; X64-NEXT:    adcq %rdx, %r11
 ; X64-NEXT:    setb %bl
-; X64-NEXT:    imulq %r12, %r15
+; X64-NEXT:    imulq %r12, %rbp
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    addq %rdx, %r15
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    adcq %rcx, %r15
+; X64-NEXT:    addq %rax, %rdx
+; X64-NEXT:    addq %rbp, %rdx
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    adcq %r14, %rdx
 ; X64-NEXT:    addq %r9, %r11
-; X64-NEXT:    movzbl %bl, %edx
-; X64-NEXT:    adcq %r8, %rdx
+; X64-NEXT:    movzbl %bl, %r9d
+; X64-NEXT:    adcq %rdi, %r9
 ; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    adcq %r15, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r10, %rcx
-; X64-NEXT:    adcq %r14, %r11
-; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r13, %r11
-; X64-NEXT:    adcq %rbp, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    adcq %rdx, %r9
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT:    adcq %r8, %r14
+; X64-NEXT:    adcq %r10, %r11
+; X64-NEXT:    adcq %rsi, %r9
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; X64-NEXT:    adcq %r15, %r11
+; X64-NEXT:    adcq %r13, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    xorq %rax, %rdx
-; X64-NEXT:    xorq %rax, %rcx
-; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    xorq %rax, %r9
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    orq %r9, %r14
 ; X64-NEXT:    xorq %rax, %r11
-; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    xorq %rcx, %rax
 ; X64-NEXT:    orq %r11, %rax
-; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    orq %r14, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq %rdi, 24(%rax)
+; X64-NEXT:    movq %rdx, 24(%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; X64-NEXT:    movq %rcx, (%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -579,59 +585,60 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
@@ -640,65 +647,65 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
@@ -706,28 +713,29 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
@@ -741,23 +749,23 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
@@ -766,57 +774,57 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
@@ -826,15 +834,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
@@ -858,111 +866,111 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
@@ -978,72 +986,71 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
@@ -1051,72 +1058,71 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl (%esp), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
@@ -1135,57 +1141,56 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl %edi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
@@ -1214,11 +1219,12 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -1226,25 +1232,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
@@ -1252,210 +1258,213 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebx ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movzbl %bl, %ebx
 ; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    setb %dl
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %dl, %ecx
-; X86-NEXT:    adcl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    xorl %eax, %ebp
-; X86-NEXT:    orl %edx, %ebp
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    orl %ebp, %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 28(%eax)
+; X86-NEXT:    movl %ebp, 28(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload

diff  --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll
index 7a3e48572b8e2..0226052402cb8 100644
--- a/llvm/test/CodeGen/X86/sse-regcall.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall.ll
@@ -196,7 +196,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
-; WIN32-NEXT:    subl $16, %esp
+; WIN32-NEXT:    subl $12, %esp
 ; WIN32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    movl %edx, %ebx
@@ -207,37 +207,36 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN32-NEXT:    subl %esi, %ebx
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    subl %ecx, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    imull %eax, %ecx
-; WIN32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %esi, %edx
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull %ebx, %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %ebx, %eax
+; WIN32-NEXT:    addl %ecx, %eax
 ; WIN32-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; WIN32-NEXT:    subl %ebp, %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %ecx
 ; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    imull %ebx, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    addl %eax, %ecx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; WIN32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull %edx, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    imull %ebp, %edi
 ; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %ebp, %eax
-; WIN32-NEXT:    addl %esi, %eax
-; WIN32-NEXT:    addl %eax, %edi
+; WIN32-NEXT:    addl %esi, %edi
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull %eax, %edx
+; WIN32-NEXT:    addl %edx, %edi
 ; WIN32-NEXT:    addl %ecx, %edi
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    addl $16, %esp
+; WIN32-NEXT:    addl $12, %esp
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
@@ -271,18 +270,18 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN64-NEXT:    # kill: def $r11d killed $r11d killed $r11
 ; WIN64-NEXT:    subl %r12d, %r11d
 ; WIN64-NEXT:    imull %edx, %r11d
+; WIN64-NEXT:    addl %r9d, %r11d
 ; WIN64-NEXT:    leal (%r14,%r15), %edx
-; WIN64-NEXT:    # kill: def $r14d killed $r14d killed $r14
-; WIN64-NEXT:    subl %r15d, %r14d
-; WIN64-NEXT:    imull %esi, %r14d
-; WIN64-NEXT:    addl %r11d, %r14d
+; WIN64-NEXT:    movl %r14d, %r9d
+; WIN64-NEXT:    subl %r15d, %r9d
+; WIN64-NEXT:    imull %esi, %r9d
+; WIN64-NEXT:    addl %r11d, %r9d
 ; WIN64-NEXT:    addl %ecx, %eax
 ; WIN64-NEXT:    imull %r8d, %eax
 ; WIN64-NEXT:    imull %ebx, %r10d
+; WIN64-NEXT:    addl %r10d, %eax
 ; WIN64-NEXT:    imull %edi, %edx
-; WIN64-NEXT:    addl %r10d, %edx
 ; WIN64-NEXT:    addl %edx, %eax
-; WIN64-NEXT:    addl %r14d, %eax
 ; WIN64-NEXT:    addl %r9d, %eax
 ; WIN64-NEXT:    popq %rbx
 ; WIN64-NEXT:    retq
@@ -312,19 +311,19 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; LINUXOSX-NEXT:    leal (%r13,%r14), %r11d
 ; LINUXOSX-NEXT:    movl %r13d, %r12d
 ; LINUXOSX-NEXT:    subl %r14d, %r12d
-; LINUXOSX-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
 ; LINUXOSX-NEXT:    imull %edx, %r12d
-; LINUXOSX-NEXT:    movl %r15d, %edx
-; LINUXOSX-NEXT:    subl %r14d, %edx
-; LINUXOSX-NEXT:    imull %esi, %edx
-; LINUXOSX-NEXT:    addl %r12d, %edx
+; LINUXOSX-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; LINUXOSX-NEXT:    addl %r9d, %r12d
+; LINUXOSX-NEXT:    movl %r15d, %r9d
+; LINUXOSX-NEXT:    subl %edx, %r9d
+; LINUXOSX-NEXT:    imull %esi, %r9d
+; LINUXOSX-NEXT:    addl %r12d, %r9d
 ; LINUXOSX-NEXT:    addl %ecx, %eax
 ; LINUXOSX-NEXT:    imull %r8d, %eax
 ; LINUXOSX-NEXT:    imull %r10d, %r11d
-; LINUXOSX-NEXT:    addl %r15d, %r14d
-; LINUXOSX-NEXT:    imull %edi, %r14d
-; LINUXOSX-NEXT:    addl %r11d, %r14d
-; LINUXOSX-NEXT:    addl %r14d, %eax
+; LINUXOSX-NEXT:    addl %r11d, %eax
+; LINUXOSX-NEXT:    addl %r15d, %edx
+; LINUXOSX-NEXT:    imull %edi, %edx
 ; LINUXOSX-NEXT:    addl %edx, %eax
 ; LINUXOSX-NEXT:    addl %r9d, %eax
 ; LINUXOSX-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll
index 833b13e796787..b5b9ce95a46ba 100644
--- a/llvm/test/CodeGen/X86/stack-clash-large.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -98,13 +98,13 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ; CHECK-X64-NEXT:    .cfi_def_cfa_offset 71888
 ; CHECK-X64-NEXT:    .cfi_offset %rax, -16
 ; CHECK-X64-NEXT:    movl 71888(%rsp), %eax
+; CHECK-X64-NEXT:    addl %esi, %edi
 ; CHECK-X64-NEXT:    addl %ecx, %edx
+; CHECK-X64-NEXT:    addl %edi, %edx
+; CHECK-X64-NEXT:    addl %r9d, %r8d
 ; CHECK-X64-NEXT:    addl 71896(%rsp), %eax
-; CHECK-X64-NEXT:    addl %esi, %edx
-; CHECK-X64-NEXT:    addl %r9d, %eax
 ; CHECK-X64-NEXT:    addl %r8d, %eax
 ; CHECK-X64-NEXT:    addl %edx, %eax
-; CHECK-X64-NEXT:    addl %edi, %eax
 ; CHECK-X64-NEXT:    movl %eax, 264(%rsp)
 ; CHECK-X64-NEXT:    movl %eax, 28664(%rsp)
 ; CHECK-X64-NEXT:    addq $71872, %rsp # imm = 0x118C0
@@ -141,16 +141,16 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ; CHECK-X86-NEXT:    .cfi_offset %edx, -12
 ; CHECK-X86-NEXT:    .cfi_offset %esi, -8
 ; CHECK-X86-NEXT:    movl 72056(%esp), %eax
-; CHECK-X86-NEXT:    movl 72048(%esp), %ecx
-; CHECK-X86-NEXT:    movl 72040(%esp), %edx
+; CHECK-X86-NEXT:    movl 72048(%esp), %edx
+; CHECK-X86-NEXT:    movl 72040(%esp), %ecx
 ; CHECK-X86-NEXT:    movl 72032(%esp), %esi
 ; CHECK-X86-NEXT:    addl 72036(%esp), %esi
-; CHECK-X86-NEXT:    addl 72044(%esp), %edx
-; CHECK-X86-NEXT:    addl 72052(%esp), %ecx
+; CHECK-X86-NEXT:    addl 72044(%esp), %ecx
+; CHECK-X86-NEXT:    addl %esi, %ecx
+; CHECK-X86-NEXT:    addl 72052(%esp), %edx
 ; CHECK-X86-NEXT:    addl 72060(%esp), %eax
-; CHECK-X86-NEXT:    addl %ecx, %eax
 ; CHECK-X86-NEXT:    addl %edx, %eax
-; CHECK-X86-NEXT:    addl %esi, %eax
+; CHECK-X86-NEXT:    addl %ecx, %eax
 ; CHECK-X86-NEXT:    movl %eax, 392(%esp)
 ; CHECK-X86-NEXT:    movl %eax, 28792(%esp)
 ; CHECK-X86-NEXT:    addl $72012, %esp # imm = 0x1194C
@@ -184,13 +184,13 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ; CHECK-X32-NEXT:    .cfi_def_cfa_offset 71888
 ; CHECK-X32-NEXT:    .cfi_offset %rax, -16
 ; CHECK-X32-NEXT:    movl 71888(%esp), %eax
+; CHECK-X32-NEXT:    addl %esi, %edi
 ; CHECK-X32-NEXT:    addl %ecx, %edx
+; CHECK-X32-NEXT:    addl %edi, %edx
+; CHECK-X32-NEXT:    addl %r9d, %r8d
 ; CHECK-X32-NEXT:    addl 71896(%esp), %eax
-; CHECK-X32-NEXT:    addl %esi, %edx
-; CHECK-X32-NEXT:    addl %r9d, %eax
 ; CHECK-X32-NEXT:    addl %r8d, %eax
 ; CHECK-X32-NEXT:    addl %edx, %eax
-; CHECK-X32-NEXT:    addl %edi, %eax
 ; CHECK-X32-NEXT:    movl %eax, 264(%esp)
 ; CHECK-X32-NEXT:    movl %eax, 28664(%esp)
 ; CHECK-X32-NEXT:    addl $71872, %esp # imm = 0x118C0

diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index 84ace4b5b0950..d43c0b93136a5 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -442,82 +442,79 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl %edi, %ebp
-; CHECK-NEXT:    movl %esi, %ebx
-; CHECK-NEXT:    movl %edx, %r12d
-; CHECK-NEXT:    movl %ecx, %r13d
-; CHECK-NEXT:    movl %r8d, %r14d
-; CHECK-NEXT:    movl %r9d, %r15d
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %r8d, %r12d
+; CHECK-NEXT:    movl %r9d, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    callq _bar ## 160-byte Folded Reload
 ; CHECK-NEXT:  Ltmp13:
-; CHECK-NEXT:    addq %r12, %rbx
-; CHECK-NEXT:    addq %r13, %rbx
-; CHECK-NEXT:    addq %r14, %rbx
-; CHECK-NEXT:    addq %r15, %rbx
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
+; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; CHECK-NEXT:    addq %rax, %r14
+; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload
+; CHECK-NEXT:    addq %r14, %r12
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r12
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r15
+; CHECK-NEXT:    addq %r12, %r15
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %r15, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rbp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rbp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rbp
+; CHECK-NEXT:    addq %rbx, %rbp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
+; CHECK-NEXT:    addq %rbp, %r13
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    addq %rbp, %rbx
-; CHECK-NEXT:    movq %rbx, %rax
+; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    addq %r13, %rcx
+; CHECK-NEXT:    movq %rcx, %rax
 ; CHECK-NEXT:    addq $168, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12

diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
index d6fef28bc8c66..60a3d94ab23b1 100644
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -554,82 +554,79 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl %edi, %ebp
-; CHECK-NEXT:    movl %esi, %ebx
-; CHECK-NEXT:    movl %edx, %r12d
-; CHECK-NEXT:    movl %ecx, %r13d
-; CHECK-NEXT:    movl %r8d, %r14d
-; CHECK-NEXT:    movl %r9d, %r15d
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %edx, %r14d
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    movl %r8d, %r12d
+; CHECK-NEXT:    movl %r9d, %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    callq _bar ## 160-byte Folded Reload
 ; CHECK-NEXT:  Ltmp14:
-; CHECK-NEXT:    addq %r12, %rbx
-; CHECK-NEXT:    addq %r13, %rbx
-; CHECK-NEXT:    addq %r14, %rbx
-; CHECK-NEXT:    addq %r15, %rbx
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
+; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; CHECK-NEXT:    addq %rax, %r14
+; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload
+; CHECK-NEXT:    addq %r14, %r12
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r12
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r15
+; CHECK-NEXT:    addq %r12, %r15
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %r15, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rbp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rbp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rbp
+; CHECK-NEXT:    addq %rbx, %rbp
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %r13
+; CHECK-NEXT:    addq %rbp, %r13
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    addq %rbp, %rbx
-; CHECK-NEXT:    movq %rbx, %rax
+; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    addq %r13, %rcx
+; CHECK-NEXT:    movq %rcx, %rax
 ; CHECK-NEXT:    addq $168, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12

diff --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll
index 24c1509b2325e..f9b73d4eaf92c 100644
--- a/llvm/test/CodeGen/X86/swift-return.ll
+++ b/llvm/test/CodeGen/X86/swift-return.ll
@@ -147,9 +147,9 @@ define dso_local i32 @test3(i32 %key) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    callq gen3 at PLT
-; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    # kill: def $r8d killed $r8d def $r8
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %r8d, %ecx
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    popq %rcx
@@ -360,7 +360,7 @@ define swiftcc { double, i64 } @test6() #0 {
 ; CHECK-NEXT:    addsd %xmm1, %xmm0
 ; CHECK-NEXT:    addsd %xmm2, %xmm0
 ; CHECK-NEXT:    addsd %xmm3, %xmm0
-; CHECK-NEXT:    addq %rdx, %rcx
+; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %r8, %rcx
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    popq %rcx

diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index fcebba2e90fef..ccd27ddcae584 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -93,7 +93,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
@@ -106,7 +106,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -142,12 +142,12 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
@@ -176,13 +176,13 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
@@ -201,120 +201,123 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addl %esi, %edx
@@ -365,7 +368,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
@@ -377,23 +380,23 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -404,45 +407,45 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -453,15 +456,15 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl %ebx, %esi
@@ -498,9 +501,9 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %esi, 8(%edx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 12(%edx)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 16(%edx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 16(%edx)
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 20(%edx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 24(%edx)
@@ -527,14 +530,15 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r8, %r11
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %r10, %rbp
 ; X64-NEXT:    movq %rdx, %r14
@@ -549,13 +553,13 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    adcq %r14, %r12
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %r10d
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r12, %r13
 ; X64-NEXT:    adcq %r10, %r15
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r14
@@ -565,7 +569,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %r12, %r10
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %r10, %r12
@@ -579,7 +583,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rbp
@@ -590,12 +594,12 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    addq %rbp, %rax
 ; X64-NEXT:    adcq %r13, %rdx
-; X64-NEXT:    imulq %r10, %r8
-; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    imulq %r10, %rcx
+; X64-NEXT:    addq %rdx, %rcx
 ; X64-NEXT:    addq %r14, %r15
 ; X64-NEXT:    adcq %r12, %rax
-; X64-NEXT:    adcq %r11, %r8
-; X64-NEXT:    imulq %r9, %rcx
+; X64-NEXT:    adcq %r11, %rcx
+; X64-NEXT:    imulq %r9, %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; X64-NEXT:    imulq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    imulq {{[0-9]+}}(%rsp), %rsi

diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index dca010064b587..cb4bdd1ede75c 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -185,8 +185,8 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl

diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 36e3749654f90..33f43c75cad3d 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -278,30 +278,28 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %bl
-; X86-NEXT:    andb %dl, %bl
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    setne %cl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %bh
+; X86-NEXT:    seto %bl
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    seto %cl
-; X86-NEXT:    orb %bh, %cl
-; X86-NEXT:    leal (%edi,%eax), %esi
-; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    setb %ch
-; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    seto %ch
 ; X86-NEXT:    orb %bl, %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    leal (%edi,%eax), %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    orb %ch, %cl
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    cmovnel %ecx, %edx

diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 644ecee4f1ca8..9800c116ea15f 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -19,13 +19,13 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    seto %r11b
 ; X64-NEXT:    orb %r10b, %r11b
+; X64-NEXT:    orb %r9b, %r11b
 ; X64-NEXT:    leaq (%rsi,%rax), %rcx
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    orb %r11b, %cl
-; X64-NEXT:    orb %r9b, %cl
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: muloti_test:
@@ -47,24 +47,24 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    seto (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    leal (%ecx,%eax), %ecx
-; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    seto %bh
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -72,18 +72,18 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    leal (%ecx,%eax), %ecx
-; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    seto %bl
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -106,7 +106,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    testl %ebp, %ebp
@@ -115,23 +115,21 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %ch
 ; X86-NEXT:    andb %cl, %ch
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    orb %ch, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    orb (%esp), %bh # 1-byte Folded Reload
+; X86-NEXT:    orb %ch, %bh
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; X86-NEXT:    movb %bh, (%esp) # 1-byte Spill
 ; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    setne %bh
-; X86-NEXT:    andb %cl, %bh
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    setne %ch
+; X86-NEXT:    andb %cl, %ch
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; X86-NEXT:    orb %ch, %bl
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
 ; X86-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    setne %bl
+; X86-NEXT:    setne %bh
 ; X86-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -141,13 +139,12 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    movl %eax, 8(%ecx)
 ; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    andb %bl, %al
+; X86-NEXT:    andb %bh, %al
+; X86-NEXT:    orb %bl, %al
+; X86-NEXT:    orb (%esp), %al # 1-byte Folded Reload
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; X86-NEXT:    orb %al, %bh
-; X86-NEXT:    andb $1, %bh
-; X86-NEXT:    movb %bh, 16(%ecx)
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    movb %al, 16(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    addl $24, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20

diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
index 3bbeec17c7a9e..132683cdb0f9e 100644
--- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -16,30 +16,28 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %bl
-; X86-NEXT:    andb %dl, %bl
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    setne %cl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %bh
+; X86-NEXT:    seto %bl
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %bh, %ch
+; X86-NEXT:    orb %bl, %ch
+; X86-NEXT:    orb %cl, %ch
 ; X86-NEXT:    leal (%edi,%eax), %esi
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb %ch, %cl
-; X86-NEXT:    orb %bl, %cl
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi

diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
index 7a5198dd6a0fa..671d7c21013de 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
@@ -295,21 +295,19 @@ define i1 @t8_3_2(i8 %X) nounwind {
 define i1 @t64_3_2(i64 %X) nounwind {
 ; X86-LABEL: t64_3_2:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X86-NEXT:    imull $-1431655765, {{[0-9]+}}(%esp), %esi # imm = 0xAAAAAAAB
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    addl $-1431655766, %eax # imm = 0xAAAAAAAA
-; X86-NEXT:    adcl $-1431655766, %esi # imm = 0xAAAAAAAA
+; X86-NEXT:    adcl $-1431655766, %edx # imm = 0xAAAAAAAA
 ; X86-NEXT:    cmpl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    sbbl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    sbbl $1431655765, %edx # imm = 0x55555555
 ; X86-NEXT:    setb %al
-; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t64_3_2:

diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index c3122b7920d6a..c2dd3539e71e1 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -235,9 +235,9 @@ define <8 x i32> @three_ands(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X32-AVX2-NEXT:    retl
 ;
@@ -247,9 +247,9 @@ define <8 x i32> @three_ands(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X64-AVX2-NEXT:    retq
 entry:
@@ -271,8 +271,8 @@ define <8 x i32> @four_ands(<8 x float> %x) {
 ; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X32-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X32-NEXT:    retl
 ;
@@ -284,8 +284,8 @@ define <8 x i32> @four_ands(<8 x float> %x) {
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X64-NEXT:    retq
 ;
@@ -295,9 +295,9 @@ define <8 x i32> @four_ands(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
@@ -310,9 +310,9 @@ define <8 x i32> @four_ands(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
@@ -339,8 +339,8 @@ define <8 x i32> @five_ands(<8 x float> %x) {
 ; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
-; X32-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X32-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X32-NEXT:    vandps %ymm0, %ymm1, %ymm0
@@ -354,8 +354,8 @@ define <8 x i32> @five_ands(<8 x float> %x) {
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; X64-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; X64-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    vandps %ymm0, %ymm1, %ymm0
@@ -367,15 +367,15 @@ define <8 x i32> @five_ands(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X32-AVX2-NEXT:    retl
 ;
@@ -385,15 +385,15 @@ define <8 x i32> @five_ands(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vandps %ymm0, %ymm1, %ymm0
 ; X64-AVX2-NEXT:    retq
 entry:
@@ -481,9 +481,9 @@ define <8 x i32> @three_or(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X32-AVX2-NEXT:    retl
 ;
@@ -493,9 +493,9 @@ define <8 x i32> @three_or(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X64-AVX2-NEXT:    retq
 entry:
@@ -517,8 +517,8 @@ define <8 x i32> @four_or(<8 x float> %x) {
 ; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X32-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X32-NEXT:    retl
 ;
@@ -530,8 +530,8 @@ define <8 x i32> @four_or(<8 x float> %x) {
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vorps %ymm2, %ymm1, %ymm1
 ; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X64-NEXT:    retq
 ;
@@ -541,9 +541,9 @@ define <8 x i32> @four_or(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
@@ -556,9 +556,9 @@ define <8 x i32> @four_or(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
@@ -585,8 +585,8 @@ define <8 x i32> @five_or(<8 x float> %x) {
 ; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
-; X32-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X32-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X32-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -600,8 +600,8 @@ define <8 x i32> @five_or(<8 x float> %x) {
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
-; X64-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; X64-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -613,15 +613,15 @@ define <8 x i32> @five_or(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-AVX2-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X32-AVX2-NEXT:    retl
 ;
@@ -631,15 +631,15 @@ define <8 x i32> @five_or(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpleps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT:    vorps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
 ; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-AVX2-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT:    vorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X64-AVX2-NEXT:    retq
 entry:
@@ -789,11 +789,11 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
 ; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X32-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT:    vandps %ymm0, %ymm3, %ymm0
+; X32-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X32-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; X32-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: five_or_and:
@@ -804,11 +804,11 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT:    vandps %ymm0, %ymm3, %ymm0
+; X64-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; X64-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    retq
 ;
 ; X32-AVX2-LABEL: five_or_and:
@@ -820,13 +820,13 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X32-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vandps %ymm0, %ymm3, %ymm0
+; X32-AVX2-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X32-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; X32-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X32-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: five_or_and:
@@ -838,13 +838,13 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
 ; X64-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vandps %ymm0, %ymm3, %ymm0
+; X64-AVX2-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm0
 ; X64-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT:    vorps %ymm0, %ymm2, %ymm0
 ; X64-AVX2-NEXT:    retq
 entry:
   %cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -936,10 +936,10 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
 ; X32-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm4
+; X32-NEXT:    vxorps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
 ; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X32-NEXT:    vandps %ymm0, %ymm4, %ymm0
-; X32-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; X32-NEXT:    vandps %ymm0, %ymm3, %ymm0
 ; X32-NEXT:    vxorps %ymm0, %ymm2, %ymm0
 ; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
@@ -951,10 +951,10 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
 ; X64-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
+; X64-NEXT:    vxorps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
 ; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT:    vandps %ymm0, %ymm4, %ymm0
-; X64-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; X64-NEXT:    vandps %ymm0, %ymm3, %ymm0
 ; X64-NEXT:    vxorps %ymm0, %ymm2, %ymm0
 ; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
@@ -967,12 +967,12 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
 ; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm4
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm5, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vandps %ymm0, %ymm4, %ymm0
-; X32-AVX2-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; X32-AVX2-NEXT:    vxorps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vandps %ymm0, %ymm3, %ymm0
 ; X32-AVX2-NEXT:    vxorps %ymm0, %ymm2, %ymm0
 ; X32-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    retl
@@ -985,12 +985,12 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
 ; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm4
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm5, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vandps %ymm0, %ymm4, %ymm0
-; X64-AVX2-NEXT:    vxorps %ymm0, %ymm3, %ymm0
+; X64-AVX2-NEXT:    vxorps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vandps %ymm0, %ymm3, %ymm0
 ; X64-AVX2-NEXT:    vxorps %ymm0, %ymm2, %ymm0
 ; X64-AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    retq
@@ -1015,12 +1015,12 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
 ; X32-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
 ; X32-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm4
-; X32-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; X32-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X32-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X32-NEXT:    vxorps %ymm1, %ymm3, %ymm1
 ; X32-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X32-NEXT:    vxorps %ymm1, %ymm2, %ymm1
 ; X32-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
 ; X32-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X32-NEXT:    retl
@@ -1032,12 +1032,12 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
 ; X64-NEXT:    vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
 ; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4
-; X64-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; X64-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT:    vxorps %ymm1, %ymm3, %ymm1
 ; X64-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT:    vxorps %ymm1, %ymm2, %ymm1
 ; X64-NEXT:    vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; X64-NEXT:    vorps %ymm0, %ymm1, %ymm0
 ; X64-NEXT:    retq
@@ -1050,14 +1050,14 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
 ; X32-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
 ; X32-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm4
-; X32-AVX2-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; X32-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT:    vxorps %ymm1, %ymm3, %ymm1
 ; X32-AVX2-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT:    vxorps %ymm1, %ymm2, %ymm1
 ; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
 ; X32-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -1071,14 +1071,14 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
 ; X64-AVX2-NEXT:    vcmpltps %ymm2, %ymm0, %ymm2
 ; X64-AVX2-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm4, %ymm0, %ymm4
-; X64-AVX2-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; X64-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT:    vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vxorps %ymm1, %ymm3, %ymm1
 ; X64-AVX2-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT:    vxorps %ymm1, %ymm2, %ymm1
 ; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
 ; X64-AVX2-NEXT:    vcmpneqps %ymm2, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vorps %ymm0, %ymm1, %ymm0

diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 094dd4b9a1aca..411d102e90f52 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3304,108 +3304,111 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    pushq %r13
 ; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    movq %r8, %r14
-; SSE2-NEXT:    movq %rcx, %r13
+; SSE2-NEXT:    movq %r8, %r15
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rsi, %r11
 ; SSE2-NEXT:    movq %rdi, %r10
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT:    movq %r11, %rcx
-; SSE2-NEXT:    sarq $63, %rcx
-; SSE2-NEXT:    movq %r9, %r15
-; SSE2-NEXT:    imulq %rcx, %r15
-; SSE2-NEXT:    movq %r14, %rax
-; SSE2-NEXT:    mulq %rcx
-; SSE2-NEXT:    movq %rax, %rdi
-; SSE2-NEXT:    addq %rax, %r15
-; SSE2-NEXT:    addq %rdx, %r15
+; SSE2-NEXT:    movq %r11, %rdi
+; SSE2-NEXT:    sarq $63, %rdi
+; SSE2-NEXT:    movq %r9, %rbx
+; SSE2-NEXT:    imulq %rdi, %rbx
+; SSE2-NEXT:    movq %r15, %rax
+; SSE2-NEXT:    mulq %rdi
+; SSE2-NEXT:    movq %rdx, %rdi
+; SSE2-NEXT:    movq %rax, %r12
+; SSE2-NEXT:    addq %rax, %rdi
+; SSE2-NEXT:    addq %rbx, %rdi
 ; SSE2-NEXT:    movq %r9, %rax
 ; SSE2-NEXT:    sarq $63, %rax
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    imulq %r11, %rcx
+; SSE2-NEXT:    movq %rax, %r13
+; SSE2-NEXT:    imulq %r11, %r13
 ; SSE2-NEXT:    mulq %r10
-; SSE2-NEXT:    movq %rax, %rbx
-; SSE2-NEXT:    addq %rax, %rcx
-; SSE2-NEXT:    addq %rdx, %rcx
-; SSE2-NEXT:    addq %rdi, %rbx
-; SSE2-NEXT:    adcq %r15, %rcx
+; SSE2-NEXT:    movq %rax, %r14
+; SSE2-NEXT:    movq %rdx, %rbx
+; SSE2-NEXT:    addq %r13, %rbx
+; SSE2-NEXT:    addq %rax, %rbx
+; SSE2-NEXT:    addq %r12, %r14
+; SSE2-NEXT:    adcq %rdi, %rbx
 ; SSE2-NEXT:    movq %r10, %rax
-; SSE2-NEXT:    mulq %r14
-; SSE2-NEXT:    movq %rdx, %r15
+; SSE2-NEXT:    mulq %r15
+; SSE2-NEXT:    movq %rdx, %r12
 ; SSE2-NEXT:    movq %rax, %rdi
 ; SSE2-NEXT:    movq %r11, %rax
-; SSE2-NEXT:    mulq %r14
-; SSE2-NEXT:    movq %rdx, %r14
-; SSE2-NEXT:    movq %rax, %r12
-; SSE2-NEXT:    addq %r15, %r12
-; SSE2-NEXT:    adcq $0, %r14
+; SSE2-NEXT:    mulq %r15
+; SSE2-NEXT:    movq %rdx, %r15
+; SSE2-NEXT:    movq %rax, %r13
+; SSE2-NEXT:    addq %r12, %r13
+; SSE2-NEXT:    adcq $0, %r15
 ; SSE2-NEXT:    movq %r10, %rax
 ; SSE2-NEXT:    mulq %r9
-; SSE2-NEXT:    movq %rdx, %r15
+; SSE2-NEXT:    movq %rdx, %r12
 ; SSE2-NEXT:    movq %rax, %r10
-; SSE2-NEXT:    addq %r12, %r10
-; SSE2-NEXT:    adcq %r14, %r15
+; SSE2-NEXT:    addq %r13, %r10
+; SSE2-NEXT:    adcq %r15, %r12
 ; SSE2-NEXT:    setb %al
-; SSE2-NEXT:    movzbl %al, %r14d
+; SSE2-NEXT:    movzbl %al, %r15d
 ; SSE2-NEXT:    movq %r11, %rax
 ; SSE2-NEXT:    mulq %r9
-; SSE2-NEXT:    addq %r15, %rax
-; SSE2-NEXT:    adcq %r14, %rdx
-; SSE2-NEXT:    addq %rbx, %rax
-; SSE2-NEXT:    adcq %rcx, %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT:    movq %r10, 8(%r15)
+; SSE2-NEXT:    addq %r12, %rax
+; SSE2-NEXT:    adcq %r15, %rdx
+; SSE2-NEXT:    addq %r14, %rax
+; SSE2-NEXT:    adcq %rbx, %rdx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE2-NEXT:    movq %r10, 8(%r12)
 ; SSE2-NEXT:    sarq $63, %r10
 ; SSE2-NEXT:    xorq %r10, %rdx
 ; SSE2-NEXT:    xorq %rax, %r10
-; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    xorl %r15d, %r15d
 ; SSE2-NEXT:    orq %rdx, %r10
-; SSE2-NEXT:    setne %cl
-; SSE2-NEXT:    movq %r13, %r9
+; SSE2-NEXT:    setne %r15b
+; SSE2-NEXT:    movq %rcx, %r9
 ; SSE2-NEXT:    sarq $63, %r9
 ; SSE2-NEXT:    movq %rbp, %r11
 ; SSE2-NEXT:    imulq %r9, %r11
 ; SSE2-NEXT:    movq %rsi, %rax
 ; SSE2-NEXT:    mulq %r9
-; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    addq %rax, %r11
-; SSE2-NEXT:    addq %rdx, %r11
+; SSE2-NEXT:    movq %rdx, %r9
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    addq %rax, %r9
+; SSE2-NEXT:    addq %r11, %r9
 ; SSE2-NEXT:    movq %rbp, %rax
 ; SSE2-NEXT:    sarq $63, %rax
 ; SSE2-NEXT:    movq %rax, %r14
-; SSE2-NEXT:    imulq %r13, %r14
+; SSE2-NEXT:    imulq %rcx, %r14
 ; SSE2-NEXT:    mulq %r8
-; SSE2-NEXT:    movq %rax, %r10
-; SSE2-NEXT:    addq %rax, %r14
-; SSE2-NEXT:    addq %rdx, %r14
-; SSE2-NEXT:    addq %r9, %r10
-; SSE2-NEXT:    adcq %r11, %r14
+; SSE2-NEXT:    movq %rax, %r11
+; SSE2-NEXT:    movq %rdx, %rbx
+; SSE2-NEXT:    addq %r14, %rbx
+; SSE2-NEXT:    addq %rax, %rbx
+; SSE2-NEXT:    addq %r10, %r11
+; SSE2-NEXT:    adcq %r9, %rbx
 ; SSE2-NEXT:    movq %r8, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %r9
-; SSE2-NEXT:    movq %rax, %r11
-; SSE2-NEXT:    movq %r13, %rax
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    movq %rcx, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %rsi
-; SSE2-NEXT:    movq %rax, %rbx
-; SSE2-NEXT:    addq %r9, %rbx
+; SSE2-NEXT:    movq %rax, %r14
+; SSE2-NEXT:    addq %r9, %r14
 ; SSE2-NEXT:    adcq $0, %rsi
 ; SSE2-NEXT:    movq %r8, %rax
 ; SSE2-NEXT:    mulq %rbp
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    addq %rbx, %r9
+; SSE2-NEXT:    addq %r14, %r9
 ; SSE2-NEXT:    adcq %rsi, %r8
 ; SSE2-NEXT:    setb %al
 ; SSE2-NEXT:    movzbl %al, %esi
-; SSE2-NEXT:    movq %r13, %rax
+; SSE2-NEXT:    movq %rcx, %rax
 ; SSE2-NEXT:    mulq %rbp
 ; SSE2-NEXT:    addq %r8, %rax
 ; SSE2-NEXT:    adcq %rsi, %rdx
-; SSE2-NEXT:    addq %r10, %rax
-; SSE2-NEXT:    adcq %r14, %rdx
-; SSE2-NEXT:    movq %r9, 24(%r15)
+; SSE2-NEXT:    addq %r11, %rax
+; SSE2-NEXT:    adcq %rbx, %rdx
+; SSE2-NEXT:    movq %r9, 24(%r12)
 ; SSE2-NEXT:    sarq $63, %r9
 ; SSE2-NEXT:    xorq %r9, %rdx
 ; SSE2-NEXT:    xorq %rax, %r9
@@ -3414,11 +3417,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    negl %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    negl %ecx
-; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    negl %r15d
+; SSE2-NEXT:    movd %r15d, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movq %r11, 16(%r15)
-; SSE2-NEXT:    movq %rdi, (%r15)
+; SSE2-NEXT:    movq %r10, 16(%r12)
+; SSE2-NEXT:    movq %rdi, (%r12)
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
 ; SSE2-NEXT:    popq %r13
@@ -3435,108 +3438,111 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    pushq %r13
 ; SSSE3-NEXT:    pushq %r12
 ; SSSE3-NEXT:    pushq %rbx
-; SSSE3-NEXT:    movq %r8, %r14
-; SSSE3-NEXT:    movq %rcx, %r13
+; SSSE3-NEXT:    movq %r8, %r15
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rsi, %r11
 ; SSSE3-NEXT:    movq %rdi, %r10
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSSE3-NEXT:    movq %r11, %rcx
-; SSSE3-NEXT:    sarq $63, %rcx
-; SSSE3-NEXT:    movq %r9, %r15
-; SSSE3-NEXT:    imulq %rcx, %r15
-; SSSE3-NEXT:    movq %r14, %rax
-; SSSE3-NEXT:    mulq %rcx
-; SSSE3-NEXT:    movq %rax, %rdi
-; SSSE3-NEXT:    addq %rax, %r15
-; SSSE3-NEXT:    addq %rdx, %r15
+; SSSE3-NEXT:    movq %r11, %rdi
+; SSSE3-NEXT:    sarq $63, %rdi
+; SSSE3-NEXT:    movq %r9, %rbx
+; SSSE3-NEXT:    imulq %rdi, %rbx
+; SSSE3-NEXT:    movq %r15, %rax
+; SSSE3-NEXT:    mulq %rdi
+; SSSE3-NEXT:    movq %rdx, %rdi
+; SSSE3-NEXT:    movq %rax, %r12
+; SSSE3-NEXT:    addq %rax, %rdi
+; SSSE3-NEXT:    addq %rbx, %rdi
 ; SSSE3-NEXT:    movq %r9, %rax
 ; SSSE3-NEXT:    sarq $63, %rax
-; SSSE3-NEXT:    movq %rax, %rcx
-; SSSE3-NEXT:    imulq %r11, %rcx
+; SSSE3-NEXT:    movq %rax, %r13
+; SSSE3-NEXT:    imulq %r11, %r13
 ; SSSE3-NEXT:    mulq %r10
-; SSSE3-NEXT:    movq %rax, %rbx
-; SSSE3-NEXT:    addq %rax, %rcx
-; SSSE3-NEXT:    addq %rdx, %rcx
-; SSSE3-NEXT:    addq %rdi, %rbx
-; SSSE3-NEXT:    adcq %r15, %rcx
+; SSSE3-NEXT:    movq %rax, %r14
+; SSSE3-NEXT:    movq %rdx, %rbx
+; SSSE3-NEXT:    addq %r13, %rbx
+; SSSE3-NEXT:    addq %rax, %rbx
+; SSSE3-NEXT:    addq %r12, %r14
+; SSSE3-NEXT:    adcq %rdi, %rbx
 ; SSSE3-NEXT:    movq %r10, %rax
-; SSSE3-NEXT:    mulq %r14
-; SSSE3-NEXT:    movq %rdx, %r15
+; SSSE3-NEXT:    mulq %r15
+; SSSE3-NEXT:    movq %rdx, %r12
 ; SSSE3-NEXT:    movq %rax, %rdi
 ; SSSE3-NEXT:    movq %r11, %rax
-; SSSE3-NEXT:    mulq %r14
-; SSSE3-NEXT:    movq %rdx, %r14
-; SSSE3-NEXT:    movq %rax, %r12
-; SSSE3-NEXT:    addq %r15, %r12
-; SSSE3-NEXT:    adcq $0, %r14
+; SSSE3-NEXT:    mulq %r15
+; SSSE3-NEXT:    movq %rdx, %r15
+; SSSE3-NEXT:    movq %rax, %r13
+; SSSE3-NEXT:    addq %r12, %r13
+; SSSE3-NEXT:    adcq $0, %r15
 ; SSSE3-NEXT:    movq %r10, %rax
 ; SSSE3-NEXT:    mulq %r9
-; SSSE3-NEXT:    movq %rdx, %r15
+; SSSE3-NEXT:    movq %rdx, %r12
 ; SSSE3-NEXT:    movq %rax, %r10
-; SSSE3-NEXT:    addq %r12, %r10
-; SSSE3-NEXT:    adcq %r14, %r15
+; SSSE3-NEXT:    addq %r13, %r10
+; SSSE3-NEXT:    adcq %r15, %r12
 ; SSSE3-NEXT:    setb %al
-; SSSE3-NEXT:    movzbl %al, %r14d
+; SSSE3-NEXT:    movzbl %al, %r15d
 ; SSSE3-NEXT:    movq %r11, %rax
 ; SSSE3-NEXT:    mulq %r9
-; SSSE3-NEXT:    addq %r15, %rax
-; SSSE3-NEXT:    adcq %r14, %rdx
-; SSSE3-NEXT:    addq %rbx, %rax
-; SSSE3-NEXT:    adcq %rcx, %rdx
-; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSSE3-NEXT:    movq %r10, 8(%r15)
+; SSSE3-NEXT:    addq %r12, %rax
+; SSSE3-NEXT:    adcq %r15, %rdx
+; SSSE3-NEXT:    addq %r14, %rax
+; SSSE3-NEXT:    adcq %rbx, %rdx
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSSE3-NEXT:    movq %r10, 8(%r12)
 ; SSSE3-NEXT:    sarq $63, %r10
 ; SSSE3-NEXT:    xorq %r10, %rdx
 ; SSSE3-NEXT:    xorq %rax, %r10
-; SSSE3-NEXT:    xorl %ecx, %ecx
+; SSSE3-NEXT:    xorl %r15d, %r15d
 ; SSSE3-NEXT:    orq %rdx, %r10
-; SSSE3-NEXT:    setne %cl
-; SSSE3-NEXT:    movq %r13, %r9
+; SSSE3-NEXT:    setne %r15b
+; SSSE3-NEXT:    movq %rcx, %r9
 ; SSSE3-NEXT:    sarq $63, %r9
 ; SSSE3-NEXT:    movq %rbp, %r11
 ; SSSE3-NEXT:    imulq %r9, %r11
 ; SSSE3-NEXT:    movq %rsi, %rax
 ; SSSE3-NEXT:    mulq %r9
-; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    addq %rax, %r11
-; SSSE3-NEXT:    addq %rdx, %r11
+; SSSE3-NEXT:    movq %rdx, %r9
+; SSSE3-NEXT:    movq %rax, %r10
+; SSSE3-NEXT:    addq %rax, %r9
+; SSSE3-NEXT:    addq %r11, %r9
 ; SSSE3-NEXT:    movq %rbp, %rax
 ; SSSE3-NEXT:    sarq $63, %rax
 ; SSSE3-NEXT:    movq %rax, %r14
-; SSSE3-NEXT:    imulq %r13, %r14
+; SSSE3-NEXT:    imulq %rcx, %r14
 ; SSSE3-NEXT:    mulq %r8
-; SSSE3-NEXT:    movq %rax, %r10
-; SSSE3-NEXT:    addq %rax, %r14
-; SSSE3-NEXT:    addq %rdx, %r14
-; SSSE3-NEXT:    addq %r9, %r10
-; SSSE3-NEXT:    adcq %r11, %r14
+; SSSE3-NEXT:    movq %rax, %r11
+; SSSE3-NEXT:    movq %rdx, %rbx
+; SSSE3-NEXT:    addq %r14, %rbx
+; SSSE3-NEXT:    addq %rax, %rbx
+; SSSE3-NEXT:    addq %r10, %r11
+; SSSE3-NEXT:    adcq %r9, %rbx
 ; SSSE3-NEXT:    movq %r8, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %r9
-; SSSE3-NEXT:    movq %rax, %r11
-; SSSE3-NEXT:    movq %r13, %rax
+; SSSE3-NEXT:    movq %rax, %r10
+; SSSE3-NEXT:    movq %rcx, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %rsi
-; SSSE3-NEXT:    movq %rax, %rbx
-; SSSE3-NEXT:    addq %r9, %rbx
+; SSSE3-NEXT:    movq %rax, %r14
+; SSSE3-NEXT:    addq %r9, %r14
 ; SSSE3-NEXT:    adcq $0, %rsi
 ; SSSE3-NEXT:    movq %r8, %rax
 ; SSSE3-NEXT:    mulq %rbp
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    addq %rbx, %r9
+; SSSE3-NEXT:    addq %r14, %r9
 ; SSSE3-NEXT:    adcq %rsi, %r8
 ; SSSE3-NEXT:    setb %al
 ; SSSE3-NEXT:    movzbl %al, %esi
-; SSSE3-NEXT:    movq %r13, %rax
+; SSSE3-NEXT:    movq %rcx, %rax
 ; SSSE3-NEXT:    mulq %rbp
 ; SSSE3-NEXT:    addq %r8, %rax
 ; SSSE3-NEXT:    adcq %rsi, %rdx
-; SSSE3-NEXT:    addq %r10, %rax
-; SSSE3-NEXT:    adcq %r14, %rdx
-; SSSE3-NEXT:    movq %r9, 24(%r15)
+; SSSE3-NEXT:    addq %r11, %rax
+; SSSE3-NEXT:    adcq %rbx, %rdx
+; SSSE3-NEXT:    movq %r9, 24(%r12)
 ; SSSE3-NEXT:    sarq $63, %r9
 ; SSSE3-NEXT:    xorq %r9, %rdx
 ; SSSE3-NEXT:    xorq %rax, %r9
@@ -3545,11 +3551,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    setne %al
 ; SSSE3-NEXT:    negl %eax
 ; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    negl %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    negl %r15d
+; SSSE3-NEXT:    movd %r15d, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movq %r11, 16(%r15)
-; SSSE3-NEXT:    movq %rdi, (%r15)
+; SSSE3-NEXT:    movq %r10, 16(%r12)
+; SSSE3-NEXT:    movq %rdi, (%r12)
 ; SSSE3-NEXT:    popq %rbx
 ; SSSE3-NEXT:    popq %r12
 ; SSSE3-NEXT:    popq %r13
@@ -3566,108 +3572,111 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    pushq %r13
 ; SSE41-NEXT:    pushq %r12
 ; SSE41-NEXT:    pushq %rbx
-; SSE41-NEXT:    movq %r8, %r14
-; SSE41-NEXT:    movq %rcx, %r13
+; SSE41-NEXT:    movq %r8, %r15
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rsi, %r11
 ; SSE41-NEXT:    movq %rdi, %r10
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT:    movq %r11, %rcx
-; SSE41-NEXT:    sarq $63, %rcx
-; SSE41-NEXT:    movq %r9, %r15
-; SSE41-NEXT:    imulq %rcx, %r15
-; SSE41-NEXT:    movq %r14, %rax
-; SSE41-NEXT:    mulq %rcx
-; SSE41-NEXT:    movq %rax, %rdi
-; SSE41-NEXT:    addq %rax, %r15
-; SSE41-NEXT:    addq %rdx, %r15
+; SSE41-NEXT:    movq %r11, %rdi
+; SSE41-NEXT:    sarq $63, %rdi
+; SSE41-NEXT:    movq %r9, %rbx
+; SSE41-NEXT:    imulq %rdi, %rbx
+; SSE41-NEXT:    movq %r15, %rax
+; SSE41-NEXT:    mulq %rdi
+; SSE41-NEXT:    movq %rdx, %rdi
+; SSE41-NEXT:    movq %rax, %r12
+; SSE41-NEXT:    addq %rax, %rdi
+; SSE41-NEXT:    addq %rbx, %rdi
 ; SSE41-NEXT:    movq %r9, %rax
 ; SSE41-NEXT:    sarq $63, %rax
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    imulq %r11, %rcx
+; SSE41-NEXT:    movq %rax, %r13
+; SSE41-NEXT:    imulq %r11, %r13
 ; SSE41-NEXT:    mulq %r10
-; SSE41-NEXT:    movq %rax, %rbx
-; SSE41-NEXT:    addq %rax, %rcx
-; SSE41-NEXT:    addq %rdx, %rcx
-; SSE41-NEXT:    addq %rdi, %rbx
-; SSE41-NEXT:    adcq %r15, %rcx
+; SSE41-NEXT:    movq %rax, %r14
+; SSE41-NEXT:    movq %rdx, %rbx
+; SSE41-NEXT:    addq %r13, %rbx
+; SSE41-NEXT:    addq %rax, %rbx
+; SSE41-NEXT:    addq %r12, %r14
+; SSE41-NEXT:    adcq %rdi, %rbx
 ; SSE41-NEXT:    movq %r10, %rax
-; SSE41-NEXT:    mulq %r14
-; SSE41-NEXT:    movq %rdx, %r15
+; SSE41-NEXT:    mulq %r15
+; SSE41-NEXT:    movq %rdx, %r12
 ; SSE41-NEXT:    movq %rax, %rdi
 ; SSE41-NEXT:    movq %r11, %rax
-; SSE41-NEXT:    mulq %r14
-; SSE41-NEXT:    movq %rdx, %r14
-; SSE41-NEXT:    movq %rax, %r12
-; SSE41-NEXT:    addq %r15, %r12
-; SSE41-NEXT:    adcq $0, %r14
+; SSE41-NEXT:    mulq %r15
+; SSE41-NEXT:    movq %rdx, %r15
+; SSE41-NEXT:    movq %rax, %r13
+; SSE41-NEXT:    addq %r12, %r13
+; SSE41-NEXT:    adcq $0, %r15
 ; SSE41-NEXT:    movq %r10, %rax
 ; SSE41-NEXT:    mulq %r9
-; SSE41-NEXT:    movq %rdx, %r15
+; SSE41-NEXT:    movq %rdx, %r12
 ; SSE41-NEXT:    movq %rax, %r10
-; SSE41-NEXT:    addq %r12, %r10
-; SSE41-NEXT:    adcq %r14, %r15
+; SSE41-NEXT:    addq %r13, %r10
+; SSE41-NEXT:    adcq %r15, %r12
 ; SSE41-NEXT:    setb %al
-; SSE41-NEXT:    movzbl %al, %r14d
+; SSE41-NEXT:    movzbl %al, %r15d
 ; SSE41-NEXT:    movq %r11, %rax
 ; SSE41-NEXT:    mulq %r9
-; SSE41-NEXT:    addq %r15, %rax
-; SSE41-NEXT:    adcq %r14, %rdx
-; SSE41-NEXT:    addq %rbx, %rax
-; SSE41-NEXT:    adcq %rcx, %rdx
-; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE41-NEXT:    movq %r10, 8(%r15)
+; SSE41-NEXT:    addq %r12, %rax
+; SSE41-NEXT:    adcq %r15, %rdx
+; SSE41-NEXT:    addq %r14, %rax
+; SSE41-NEXT:    adcq %rbx, %rdx
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE41-NEXT:    movq %r10, 8(%r12)
 ; SSE41-NEXT:    sarq $63, %r10
 ; SSE41-NEXT:    xorq %r10, %rdx
 ; SSE41-NEXT:    xorq %rax, %r10
-; SSE41-NEXT:    xorl %ecx, %ecx
+; SSE41-NEXT:    xorl %r15d, %r15d
 ; SSE41-NEXT:    orq %rdx, %r10
-; SSE41-NEXT:    setne %cl
-; SSE41-NEXT:    movq %r13, %r9
+; SSE41-NEXT:    setne %r15b
+; SSE41-NEXT:    movq %rcx, %r9
 ; SSE41-NEXT:    sarq $63, %r9
 ; SSE41-NEXT:    movq %rbp, %r11
 ; SSE41-NEXT:    imulq %r9, %r11
 ; SSE41-NEXT:    movq %rsi, %rax
 ; SSE41-NEXT:    mulq %r9
-; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    addq %rax, %r11
-; SSE41-NEXT:    addq %rdx, %r11
+; SSE41-NEXT:    movq %rdx, %r9
+; SSE41-NEXT:    movq %rax, %r10
+; SSE41-NEXT:    addq %rax, %r9
+; SSE41-NEXT:    addq %r11, %r9
 ; SSE41-NEXT:    movq %rbp, %rax
 ; SSE41-NEXT:    sarq $63, %rax
 ; SSE41-NEXT:    movq %rax, %r14
-; SSE41-NEXT:    imulq %r13, %r14
+; SSE41-NEXT:    imulq %rcx, %r14
 ; SSE41-NEXT:    mulq %r8
-; SSE41-NEXT:    movq %rax, %r10
-; SSE41-NEXT:    addq %rax, %r14
-; SSE41-NEXT:    addq %rdx, %r14
-; SSE41-NEXT:    addq %r9, %r10
-; SSE41-NEXT:    adcq %r11, %r14
+; SSE41-NEXT:    movq %rax, %r11
+; SSE41-NEXT:    movq %rdx, %rbx
+; SSE41-NEXT:    addq %r14, %rbx
+; SSE41-NEXT:    addq %rax, %rbx
+; SSE41-NEXT:    addq %r10, %r11
+; SSE41-NEXT:    adcq %r9, %rbx
 ; SSE41-NEXT:    movq %r8, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %r9
-; SSE41-NEXT:    movq %rax, %r11
-; SSE41-NEXT:    movq %r13, %rax
+; SSE41-NEXT:    movq %rax, %r10
+; SSE41-NEXT:    movq %rcx, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %rsi
-; SSE41-NEXT:    movq %rax, %rbx
-; SSE41-NEXT:    addq %r9, %rbx
+; SSE41-NEXT:    movq %rax, %r14
+; SSE41-NEXT:    addq %r9, %r14
 ; SSE41-NEXT:    adcq $0, %rsi
 ; SSE41-NEXT:    movq %r8, %rax
 ; SSE41-NEXT:    mulq %rbp
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    addq %rbx, %r9
+; SSE41-NEXT:    addq %r14, %r9
 ; SSE41-NEXT:    adcq %rsi, %r8
 ; SSE41-NEXT:    setb %al
 ; SSE41-NEXT:    movzbl %al, %esi
-; SSE41-NEXT:    movq %r13, %rax
+; SSE41-NEXT:    movq %rcx, %rax
 ; SSE41-NEXT:    mulq %rbp
 ; SSE41-NEXT:    addq %r8, %rax
 ; SSE41-NEXT:    adcq %rsi, %rdx
-; SSE41-NEXT:    addq %r10, %rax
-; SSE41-NEXT:    adcq %r14, %rdx
-; SSE41-NEXT:    movq %r9, 24(%r15)
+; SSE41-NEXT:    addq %r11, %rax
+; SSE41-NEXT:    adcq %rbx, %rdx
+; SSE41-NEXT:    movq %r9, 24(%r12)
 ; SSE41-NEXT:    sarq $63, %r9
 ; SSE41-NEXT:    xorq %r9, %rdx
 ; SSE41-NEXT:    xorq %rax, %r9
@@ -3675,11 +3684,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    orq %rdx, %r9
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    negl %eax
-; SSE41-NEXT:    negl %ecx
-; SSE41-NEXT:    movd %ecx, %xmm0
+; SSE41-NEXT:    negl %r15d
+; SSE41-NEXT:    movd %r15d, %xmm0
 ; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movq %r11, 16(%r15)
-; SSE41-NEXT:    movq %rdi, (%r15)
+; SSE41-NEXT:    movq %r10, 16(%r12)
+; SSE41-NEXT:    movq %rdi, (%r12)
 ; SSE41-NEXT:    popq %rbx
 ; SSE41-NEXT:    popq %r12
 ; SSE41-NEXT:    popq %r13
@@ -3696,108 +3705,111 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    pushq %r13
 ; AVX-NEXT:    pushq %r12
 ; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    movq %r8, %r14
-; AVX-NEXT:    movq %rcx, %r13
+; AVX-NEXT:    movq %r8, %r15
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rsi, %r11
 ; AVX-NEXT:    movq %rdi, %r10
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT:    movq %r11, %rcx
-; AVX-NEXT:    sarq $63, %rcx
-; AVX-NEXT:    movq %r9, %r15
-; AVX-NEXT:    imulq %rcx, %r15
-; AVX-NEXT:    movq %r14, %rax
-; AVX-NEXT:    mulq %rcx
-; AVX-NEXT:    movq %rax, %rdi
-; AVX-NEXT:    addq %rax, %r15
-; AVX-NEXT:    addq %rdx, %r15
+; AVX-NEXT:    movq %r11, %rdi
+; AVX-NEXT:    sarq $63, %rdi
+; AVX-NEXT:    movq %r9, %rbx
+; AVX-NEXT:    imulq %rdi, %rbx
+; AVX-NEXT:    movq %r15, %rax
+; AVX-NEXT:    mulq %rdi
+; AVX-NEXT:    movq %rdx, %rdi
+; AVX-NEXT:    movq %rax, %r12
+; AVX-NEXT:    addq %rax, %rdi
+; AVX-NEXT:    addq %rbx, %rdi
 ; AVX-NEXT:    movq %r9, %rax
 ; AVX-NEXT:    sarq $63, %rax
-; AVX-NEXT:    movq %rax, %rcx
-; AVX-NEXT:    imulq %r11, %rcx
+; AVX-NEXT:    movq %rax, %r13
+; AVX-NEXT:    imulq %r11, %r13
 ; AVX-NEXT:    mulq %r10
-; AVX-NEXT:    movq %rax, %rbx
-; AVX-NEXT:    addq %rax, %rcx
-; AVX-NEXT:    addq %rdx, %rcx
-; AVX-NEXT:    addq %rdi, %rbx
-; AVX-NEXT:    adcq %r15, %rcx
+; AVX-NEXT:    movq %rax, %r14
+; AVX-NEXT:    movq %rdx, %rbx
+; AVX-NEXT:    addq %r13, %rbx
+; AVX-NEXT:    addq %rax, %rbx
+; AVX-NEXT:    addq %r12, %r14
+; AVX-NEXT:    adcq %rdi, %rbx
 ; AVX-NEXT:    movq %r10, %rax
-; AVX-NEXT:    mulq %r14
-; AVX-NEXT:    movq %rdx, %r15
+; AVX-NEXT:    mulq %r15
+; AVX-NEXT:    movq %rdx, %r12
 ; AVX-NEXT:    movq %rax, %rdi
 ; AVX-NEXT:    movq %r11, %rax
-; AVX-NEXT:    mulq %r14
-; AVX-NEXT:    movq %rdx, %r14
-; AVX-NEXT:    movq %rax, %r12
-; AVX-NEXT:    addq %r15, %r12
-; AVX-NEXT:    adcq $0, %r14
+; AVX-NEXT:    mulq %r15
+; AVX-NEXT:    movq %rdx, %r15
+; AVX-NEXT:    movq %rax, %r13
+; AVX-NEXT:    addq %r12, %r13
+; AVX-NEXT:    adcq $0, %r15
 ; AVX-NEXT:    movq %r10, %rax
 ; AVX-NEXT:    mulq %r9
-; AVX-NEXT:    movq %rdx, %r15
+; AVX-NEXT:    movq %rdx, %r12
 ; AVX-NEXT:    movq %rax, %r10
-; AVX-NEXT:    addq %r12, %r10
-; AVX-NEXT:    adcq %r14, %r15
+; AVX-NEXT:    addq %r13, %r10
+; AVX-NEXT:    adcq %r15, %r12
 ; AVX-NEXT:    setb %al
-; AVX-NEXT:    movzbl %al, %r14d
+; AVX-NEXT:    movzbl %al, %r15d
 ; AVX-NEXT:    movq %r11, %rax
 ; AVX-NEXT:    mulq %r9
-; AVX-NEXT:    addq %r15, %rax
-; AVX-NEXT:    adcq %r14, %rdx
-; AVX-NEXT:    addq %rbx, %rax
-; AVX-NEXT:    adcq %rcx, %rdx
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX-NEXT:    movq %r10, 8(%r15)
+; AVX-NEXT:    addq %r12, %rax
+; AVX-NEXT:    adcq %r15, %rdx
+; AVX-NEXT:    addq %r14, %rax
+; AVX-NEXT:    adcq %rbx, %rdx
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX-NEXT:    movq %r10, 8(%r12)
 ; AVX-NEXT:    sarq $63, %r10
 ; AVX-NEXT:    xorq %r10, %rdx
 ; AVX-NEXT:    xorq %rax, %r10
-; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    xorl %r15d, %r15d
 ; AVX-NEXT:    orq %rdx, %r10
-; AVX-NEXT:    setne %cl
-; AVX-NEXT:    movq %r13, %r9
+; AVX-NEXT:    setne %r15b
+; AVX-NEXT:    movq %rcx, %r9
 ; AVX-NEXT:    sarq $63, %r9
 ; AVX-NEXT:    movq %rbp, %r11
 ; AVX-NEXT:    imulq %r9, %r11
 ; AVX-NEXT:    movq %rsi, %rax
 ; AVX-NEXT:    mulq %r9
-; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    addq %rax, %r11
-; AVX-NEXT:    addq %rdx, %r11
+; AVX-NEXT:    movq %rdx, %r9
+; AVX-NEXT:    movq %rax, %r10
+; AVX-NEXT:    addq %rax, %r9
+; AVX-NEXT:    addq %r11, %r9
 ; AVX-NEXT:    movq %rbp, %rax
 ; AVX-NEXT:    sarq $63, %rax
 ; AVX-NEXT:    movq %rax, %r14
-; AVX-NEXT:    imulq %r13, %r14
+; AVX-NEXT:    imulq %rcx, %r14
 ; AVX-NEXT:    mulq %r8
-; AVX-NEXT:    movq %rax, %r10
-; AVX-NEXT:    addq %rax, %r14
-; AVX-NEXT:    addq %rdx, %r14
-; AVX-NEXT:    addq %r9, %r10
-; AVX-NEXT:    adcq %r11, %r14
+; AVX-NEXT:    movq %rax, %r11
+; AVX-NEXT:    movq %rdx, %rbx
+; AVX-NEXT:    addq %r14, %rbx
+; AVX-NEXT:    addq %rax, %rbx
+; AVX-NEXT:    addq %r10, %r11
+; AVX-NEXT:    adcq %r9, %rbx
 ; AVX-NEXT:    movq %r8, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %r9
-; AVX-NEXT:    movq %rax, %r11
-; AVX-NEXT:    movq %r13, %rax
+; AVX-NEXT:    movq %rax, %r10
+; AVX-NEXT:    movq %rcx, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %rsi
-; AVX-NEXT:    movq %rax, %rbx
-; AVX-NEXT:    addq %r9, %rbx
+; AVX-NEXT:    movq %rax, %r14
+; AVX-NEXT:    addq %r9, %r14
 ; AVX-NEXT:    adcq $0, %rsi
 ; AVX-NEXT:    movq %r8, %rax
 ; AVX-NEXT:    mulq %rbp
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    addq %rbx, %r9
+; AVX-NEXT:    addq %r14, %r9
 ; AVX-NEXT:    adcq %rsi, %r8
 ; AVX-NEXT:    setb %al
 ; AVX-NEXT:    movzbl %al, %esi
-; AVX-NEXT:    movq %r13, %rax
+; AVX-NEXT:    movq %rcx, %rax
 ; AVX-NEXT:    mulq %rbp
 ; AVX-NEXT:    addq %r8, %rax
 ; AVX-NEXT:    adcq %rsi, %rdx
-; AVX-NEXT:    addq %r10, %rax
-; AVX-NEXT:    adcq %r14, %rdx
-; AVX-NEXT:    movq %r9, 24(%r15)
+; AVX-NEXT:    addq %r11, %rax
+; AVX-NEXT:    adcq %rbx, %rdx
+; AVX-NEXT:    movq %r9, 24(%r12)
 ; AVX-NEXT:    sarq $63, %r9
 ; AVX-NEXT:    xorq %r9, %rdx
 ; AVX-NEXT:    xorq %rax, %r9
@@ -3805,11 +3817,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    orq %rdx, %r9
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    negl %eax
-; AVX-NEXT:    negl %ecx
-; AVX-NEXT:    vmovd %ecx, %xmm0
+; AVX-NEXT:    negl %r15d
+; AVX-NEXT:    vmovd %r15d, %xmm0
 ; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movq %r11, 16(%r15)
-; AVX-NEXT:    movq %rdi, (%r15)
+; AVX-NEXT:    movq %r10, 16(%r12)
+; AVX-NEXT:    movq %rdi, (%r12)
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    popq %r12
 ; AVX-NEXT:    popq %r13
@@ -3826,104 +3838,110 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    pushq %r13
 ; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq %r9, %rbp
 ; AVX512F-NEXT:    movq %rcx, %r11
 ; AVX512F-NEXT:    movq %rdx, %r10
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX512F-NEXT:    movq %rsi, %r9
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; AVX512F-NEXT:    sarq $63, %rcx
-; AVX512F-NEXT:    movq %rbp, %r12
-; AVX512F-NEXT:    imulq %rcx, %r12
-; AVX512F-NEXT:    movq %r14, %rax
+; AVX512F-NEXT:    movq %rsi, %rbx
+; AVX512F-NEXT:    imulq %rcx, %rbx
+; AVX512F-NEXT:    movq %r15, %rax
 ; AVX512F-NEXT:    mulq %rcx
-; AVX512F-NEXT:    movq %rax, %r15
-; AVX512F-NEXT:    addq %rax, %r12
-; AVX512F-NEXT:    addq %rdx, %r12
-; AVX512F-NEXT:    movq %rbp, %rax
+; AVX512F-NEXT:    movq %rdx, %rcx
+; AVX512F-NEXT:    movq %rax, %r12
+; AVX512F-NEXT:    addq %rax, %rcx
+; AVX512F-NEXT:    addq %rbx, %rcx
+; AVX512F-NEXT:    movq %rsi, %rax
 ; AVX512F-NEXT:    sarq $63, %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    imulq %r11, %rcx
+; AVX512F-NEXT:    movq %rax, %r13
+; AVX512F-NEXT:    imulq %r11, %r13
 ; AVX512F-NEXT:    mulq %r10
-; AVX512F-NEXT:    movq %rax, %rbx
-; AVX512F-NEXT:    addq %rax, %rcx
-; AVX512F-NEXT:    addq %rdx, %rcx
-; AVX512F-NEXT:    addq %r15, %rbx
-; AVX512F-NEXT:    adcq %r12, %rcx
+; AVX512F-NEXT:    movq %rax, %r14
+; AVX512F-NEXT:    movq %rdx, %rbx
+; AVX512F-NEXT:    addq %r13, %rbx
+; AVX512F-NEXT:    addq %rax, %rbx
+; AVX512F-NEXT:    addq %r12, %r14
+; AVX512F-NEXT:    adcq %rcx, %rbx
 ; AVX512F-NEXT:    movq %r10, %rax
-; AVX512F-NEXT:    mulq %r14
-; AVX512F-NEXT:    movq %rdx, %r15
-; AVX512F-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512F-NEXT:    mulq %r15
+; AVX512F-NEXT:    movq %rdx, %r12
+; AVX512F-NEXT:    movq %rax, %rcx
 ; AVX512F-NEXT:    movq %r11, %rax
-; AVX512F-NEXT:    mulq %r14
-; AVX512F-NEXT:    movq %rdx, %r14
-; AVX512F-NEXT:    movq %rax, %r12
-; AVX512F-NEXT:    addq %r15, %r12
-; AVX512F-NEXT:    adcq $0, %r14
-; AVX512F-NEXT:    movq %r10, %rax
-; AVX512F-NEXT:    mulq %rbp
+; AVX512F-NEXT:    mulq %r15
 ; AVX512F-NEXT:    movq %rdx, %r15
+; AVX512F-NEXT:    movq %rax, %r13
+; AVX512F-NEXT:    addq %r12, %r13
+; AVX512F-NEXT:    adcq $0, %r15
+; AVX512F-NEXT:    movq %r10, %rax
+; AVX512F-NEXT:    mulq %rsi
+; AVX512F-NEXT:    movq %rdx, %r12
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %r12, %r10
-; AVX512F-NEXT:    adcq %r14, %r15
+; AVX512F-NEXT:    addq %r13, %r10
+; AVX512F-NEXT:    adcq %r15, %r12
 ; AVX512F-NEXT:    setb %al
-; AVX512F-NEXT:    movzbl %al, %r14d
+; AVX512F-NEXT:    movzbl %al, %r15d
 ; AVX512F-NEXT:    movq %r11, %rax
-; AVX512F-NEXT:    mulq %rbp
-; AVX512F-NEXT:    addq %r15, %rax
-; AVX512F-NEXT:    adcq %r14, %rdx
-; AVX512F-NEXT:    addq %rbx, %rax
-; AVX512F-NEXT:    adcq %rcx, %rdx
-; AVX512F-NEXT:    movq %r10, 24(%r13)
+; AVX512F-NEXT:    mulq %rsi
+; AVX512F-NEXT:    addq %r12, %rax
+; AVX512F-NEXT:    adcq %r15, %rdx
+; AVX512F-NEXT:    addq %r14, %rax
+; AVX512F-NEXT:    adcq %rbx, %rdx
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512F-NEXT:    movq %r10, 24(%r12)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
 ; AVX512F-NEXT:    orq %rdx, %r10
 ; AVX512F-NEXT:    setne %al
 ; AVX512F-NEXT:    kmovw %eax, %k0
-; AVX512F-NEXT:    movq %rsi, %rcx
-; AVX512F-NEXT:    sarq $63, %rcx
-; AVX512F-NEXT:    movq %r9, %rbx
-; AVX512F-NEXT:    imulq %rcx, %rbx
+; AVX512F-NEXT:    movq %r9, %rsi
+; AVX512F-NEXT:    sarq $63, %rsi
+; AVX512F-NEXT:    movq %rbp, %rbx
+; AVX512F-NEXT:    imulq %rsi, %rbx
 ; AVX512F-NEXT:    movq %r8, %rax
-; AVX512F-NEXT:    mulq %rcx
-; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %rax, %rbx
-; AVX512F-NEXT:    addq %rdx, %rbx
-; AVX512F-NEXT:    movq %r9, %rax
+; AVX512F-NEXT:    mulq %rsi
+; AVX512F-NEXT:    movq %rdx, %r10
+; AVX512F-NEXT:    movq %rax, %r11
+; AVX512F-NEXT:    addq %rax, %r10
+; AVX512F-NEXT:    addq %rbx, %r10
+; AVX512F-NEXT:    movq %rbp, %rax
 ; AVX512F-NEXT:    sarq $63, %rax
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    imulq %rsi, %rcx
+; AVX512F-NEXT:    movq %rax, %rsi
+; AVX512F-NEXT:    imulq %r9, %rsi
 ; AVX512F-NEXT:    mulq %rdi
-; AVX512F-NEXT:    movq %rax, %r11
-; AVX512F-NEXT:    addq %rax, %rcx
-; AVX512F-NEXT:    addq %rdx, %rcx
-; AVX512F-NEXT:    addq %r10, %r11
-; AVX512F-NEXT:    adcq %rbx, %rcx
+; AVX512F-NEXT:    movq %rax, %rbx
+; AVX512F-NEXT:    movq %rdx, %r14
+; AVX512F-NEXT:    addq %rsi, %r14
+; AVX512F-NEXT:    addq %rax, %r14
+; AVX512F-NEXT:    addq %r11, %rbx
+; AVX512F-NEXT:    adcq %r10, %r14
 ; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    movq %rdx, %r10
-; AVX512F-NEXT:    movq %rax, %rbx
-; AVX512F-NEXT:    movq %rsi, %rax
+; AVX512F-NEXT:    movq %rax, %r11
+; AVX512F-NEXT:    movq %r9, %rax
 ; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    movq %rdx, %r8
-; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    addq %r10, %r14
+; AVX512F-NEXT:    movq %rax, %r15
+; AVX512F-NEXT:    addq %r10, %r15
 ; AVX512F-NEXT:    adcq $0, %r8
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    mulq %r9
+; AVX512F-NEXT:    mulq %rbp
 ; AVX512F-NEXT:    movq %rdx, %rdi
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %r14, %r10
+; AVX512F-NEXT:    addq %r15, %r10
 ; AVX512F-NEXT:    adcq %r8, %rdi
 ; AVX512F-NEXT:    setb %al
-; AVX512F-NEXT:    movzbl %al, %r8d
-; AVX512F-NEXT:    movq %rsi, %rax
-; AVX512F-NEXT:    mulq %r9
+; AVX512F-NEXT:    movzbl %al, %esi
+; AVX512F-NEXT:    movq %r9, %rax
+; AVX512F-NEXT:    mulq %rbp
 ; AVX512F-NEXT:    addq %rdi, %rax
-; AVX512F-NEXT:    adcq %r8, %rdx
-; AVX512F-NEXT:    addq %r11, %rax
-; AVX512F-NEXT:    adcq %rcx, %rdx
-; AVX512F-NEXT:    movq %r10, 8(%r13)
+; AVX512F-NEXT:    adcq %rsi, %rdx
+; AVX512F-NEXT:    addq %rbx, %rax
+; AVX512F-NEXT:    adcq %r14, %rdx
+; AVX512F-NEXT:    movq %r10, 8(%r12)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
@@ -3935,9 +3953,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    korw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT:    movq %rax, 16(%r13)
-; AVX512F-NEXT:    movq %rbx, (%r13)
+; AVX512F-NEXT:    movq %rcx, 16(%r12)
+; AVX512F-NEXT:    movq %r11, (%r12)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r12
 ; AVX512F-NEXT:    popq %r13
@@ -3954,104 +3971,110 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    pushq %r13
 ; AVX512BW-NEXT:    pushq %r12
 ; AVX512BW-NEXT:    pushq %rbx
+; AVX512BW-NEXT:    movq %r9, %rbp
 ; AVX512BW-NEXT:    movq %rcx, %r11
 ; AVX512BW-NEXT:    movq %rdx, %r10
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX512BW-NEXT:    movq %rsi, %r9
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; AVX512BW-NEXT:    sarq $63, %rcx
-; AVX512BW-NEXT:    movq %rbp, %r12
-; AVX512BW-NEXT:    imulq %rcx, %r12
-; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    movq %rsi, %rbx
+; AVX512BW-NEXT:    imulq %rcx, %rbx
+; AVX512BW-NEXT:    movq %r15, %rax
 ; AVX512BW-NEXT:    mulq %rcx
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    addq %rax, %r12
-; AVX512BW-NEXT:    addq %rdx, %r12
-; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    movq %rdx, %rcx
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    addq %rbx, %rcx
+; AVX512BW-NEXT:    movq %rsi, %rax
 ; AVX512BW-NEXT:    sarq $63, %rax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    imulq %r11, %rcx
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    imulq %r11, %r13
 ; AVX512BW-NEXT:    mulq %r10
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    addq %rdx, %rcx
-; AVX512BW-NEXT:    addq %r15, %rbx
-; AVX512BW-NEXT:    adcq %r12, %rcx
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    movq %rdx, %rbx
+; AVX512BW-NEXT:    addq %r13, %rbx
+; AVX512BW-NEXT:    addq %rax, %rbx
+; AVX512BW-NEXT:    addq %r12, %r14
+; AVX512BW-NEXT:    adcq %rcx, %rbx
 ; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    mulq %r14
-; AVX512BW-NEXT:    movq %rdx, %r15
-; AVX512BW-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    mulq %r15
+; AVX512BW-NEXT:    movq %rdx, %r12
+; AVX512BW-NEXT:    movq %rax, %rcx
 ; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    mulq %r14
-; AVX512BW-NEXT:    movq %rdx, %r14
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    addq %r15, %r12
-; AVX512BW-NEXT:    adcq $0, %r14
-; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    mulq %rbp
+; AVX512BW-NEXT:    mulq %r15
 ; AVX512BW-NEXT:    movq %rdx, %r15
+; AVX512BW-NEXT:    movq %rax, %r13
+; AVX512BW-NEXT:    addq %r12, %r13
+; AVX512BW-NEXT:    adcq $0, %r15
+; AVX512BW-NEXT:    movq %r10, %rax
+; AVX512BW-NEXT:    mulq %rsi
+; AVX512BW-NEXT:    movq %rdx, %r12
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %r12, %r10
-; AVX512BW-NEXT:    adcq %r14, %r15
+; AVX512BW-NEXT:    addq %r13, %r10
+; AVX512BW-NEXT:    adcq %r15, %r12
 ; AVX512BW-NEXT:    setb %al
-; AVX512BW-NEXT:    movzbl %al, %r14d
+; AVX512BW-NEXT:    movzbl %al, %r15d
 ; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    mulq %rbp
-; AVX512BW-NEXT:    addq %r15, %rax
-; AVX512BW-NEXT:    adcq %r14, %rdx
-; AVX512BW-NEXT:    addq %rbx, %rax
-; AVX512BW-NEXT:    adcq %rcx, %rdx
-; AVX512BW-NEXT:    movq %r10, 24(%r13)
+; AVX512BW-NEXT:    mulq %rsi
+; AVX512BW-NEXT:    addq %r12, %rax
+; AVX512BW-NEXT:    adcq %r15, %rdx
+; AVX512BW-NEXT:    addq %r14, %rax
+; AVX512BW-NEXT:    adcq %rbx, %rdx
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512BW-NEXT:    movq %r10, 24(%r12)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
 ; AVX512BW-NEXT:    orq %rdx, %r10
 ; AVX512BW-NEXT:    setne %al
 ; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    movq %rsi, %rcx
-; AVX512BW-NEXT:    sarq $63, %rcx
-; AVX512BW-NEXT:    movq %r9, %rbx
-; AVX512BW-NEXT:    imulq %rcx, %rbx
+; AVX512BW-NEXT:    movq %r9, %rsi
+; AVX512BW-NEXT:    sarq $63, %rsi
+; AVX512BW-NEXT:    movq %rbp, %rbx
+; AVX512BW-NEXT:    imulq %rsi, %rbx
 ; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    mulq %rcx
-; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %rax, %rbx
-; AVX512BW-NEXT:    addq %rdx, %rbx
-; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    mulq %rsi
+; AVX512BW-NEXT:    movq %rdx, %r10
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    addq %rax, %r10
+; AVX512BW-NEXT:    addq %rbx, %r10
+; AVX512BW-NEXT:    movq %rbp, %rax
 ; AVX512BW-NEXT:    sarq $63, %rax
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    imulq %rsi, %rcx
+; AVX512BW-NEXT:    movq %rax, %rsi
+; AVX512BW-NEXT:    imulq %r9, %rsi
 ; AVX512BW-NEXT:    mulq %rdi
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    addq %rdx, %rcx
-; AVX512BW-NEXT:    addq %r10, %r11
-; AVX512BW-NEXT:    adcq %rbx, %rcx
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    movq %rdx, %r14
+; AVX512BW-NEXT:    addq %rsi, %r14
+; AVX512BW-NEXT:    addq %rax, %r14
+; AVX512BW-NEXT:    addq %r11, %rbx
+; AVX512BW-NEXT:    adcq %r10, %r14
 ; AVX512BW-NEXT:    movq %rdi, %rax
 ; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    movq %rdx, %r10
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    movq %rsi, %rax
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    movq %r9, %rax
 ; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    addq %r10, %r14
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    addq %r10, %r15
 ; AVX512BW-NEXT:    adcq $0, %r8
 ; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    mulq %r9
+; AVX512BW-NEXT:    mulq %rbp
 ; AVX512BW-NEXT:    movq %rdx, %rdi
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %r14, %r10
+; AVX512BW-NEXT:    addq %r15, %r10
 ; AVX512BW-NEXT:    adcq %r8, %rdi
 ; AVX512BW-NEXT:    setb %al
-; AVX512BW-NEXT:    movzbl %al, %r8d
-; AVX512BW-NEXT:    movq %rsi, %rax
-; AVX512BW-NEXT:    mulq %r9
+; AVX512BW-NEXT:    movzbl %al, %esi
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    mulq %rbp
 ; AVX512BW-NEXT:    addq %rdi, %rax
-; AVX512BW-NEXT:    adcq %r8, %rdx
-; AVX512BW-NEXT:    addq %r11, %rax
-; AVX512BW-NEXT:    adcq %rcx, %rdx
-; AVX512BW-NEXT:    movq %r10, 8(%r13)
+; AVX512BW-NEXT:    adcq %rsi, %rdx
+; AVX512BW-NEXT:    addq %rbx, %rax
+; AVX512BW-NEXT:    adcq %r14, %rdx
+; AVX512BW-NEXT:    movq %r10, 8(%r12)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
@@ -4063,9 +4086,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    korw %k0, %k1, %k1
 ; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT:    movq %rax, 16(%r13)
-; AVX512BW-NEXT:    movq %rbx, (%r13)
+; AVX512BW-NEXT:    movq %rcx, 16(%r12)
+; AVX512BW-NEXT:    movq %r11, (%r12)
 ; AVX512BW-NEXT:    popq %rbx
 ; AVX512BW-NEXT:    popq %r12
 ; AVX512BW-NEXT:    popq %r13

diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index e556e7a7d6e7b..e98a6918c8295 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2932,6 +2932,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    mulq %rdi
 ; SSE2-NEXT:    seto %r12b
 ; SSE2-NEXT:    orb %r15b, %r12b
+; SSE2-NEXT:    orb %bpl, %r12b
 ; SSE2-NEXT:    leaq (%rsi,%rax), %r10
 ; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    mulq %r8
@@ -2940,7 +2941,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    addq %r10, %rsi
 ; SSE2-NEXT:    setb %r10b
 ; SSE2-NEXT:    orb %r12b, %r10b
-; SSE2-NEXT:    orb %bpl, %r10b
 ; SSE2-NEXT:    testq %r9, %r9
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    testq %r11, %r11
@@ -2954,13 +2954,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    mulq %rcx
 ; SSE2-NEXT:    seto %r9b
 ; SSE2-NEXT:    orb %r11b, %r9b
+; SSE2-NEXT:    orb %bpl, %r9b
 ; SSE2-NEXT:    addq %rax, %r8
 ; SSE2-NEXT:    movq %rcx, %rax
 ; SSE2-NEXT:    mulq %r14
 ; SSE2-NEXT:    addq %r8, %rdx
 ; SSE2-NEXT:    setb %cl
 ; SSE2-NEXT:    orb %r9b, %cl
-; SSE2-NEXT:    orb %bpl, %cl
 ; SSE2-NEXT:    movzbl %cl, %ecx
 ; SSE2-NEXT:    negl %ecx
 ; SSE2-NEXT:    movd %ecx, %xmm1
@@ -3005,6 +3005,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    mulq %rdi
 ; SSSE3-NEXT:    seto %r12b
 ; SSSE3-NEXT:    orb %r15b, %r12b
+; SSSE3-NEXT:    orb %bpl, %r12b
 ; SSSE3-NEXT:    leaq (%rsi,%rax), %r10
 ; SSSE3-NEXT:    movq %rdi, %rax
 ; SSSE3-NEXT:    mulq %r8
@@ -3013,7 +3014,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    addq %r10, %rsi
 ; SSSE3-NEXT:    setb %r10b
 ; SSSE3-NEXT:    orb %r12b, %r10b
-; SSSE3-NEXT:    orb %bpl, %r10b
 ; SSSE3-NEXT:    testq %r9, %r9
 ; SSSE3-NEXT:    setne %al
 ; SSSE3-NEXT:    testq %r11, %r11
@@ -3027,13 +3027,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    mulq %rcx
 ; SSSE3-NEXT:    seto %r9b
 ; SSSE3-NEXT:    orb %r11b, %r9b
+; SSSE3-NEXT:    orb %bpl, %r9b
 ; SSSE3-NEXT:    addq %rax, %r8
 ; SSSE3-NEXT:    movq %rcx, %rax
 ; SSSE3-NEXT:    mulq %r14
 ; SSSE3-NEXT:    addq %r8, %rdx
 ; SSSE3-NEXT:    setb %cl
 ; SSSE3-NEXT:    orb %r9b, %cl
-; SSSE3-NEXT:    orb %bpl, %cl
 ; SSSE3-NEXT:    movzbl %cl, %ecx
 ; SSSE3-NEXT:    negl %ecx
 ; SSSE3-NEXT:    movd %ecx, %xmm1
@@ -3078,6 +3078,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    mulq %rdi
 ; SSE41-NEXT:    seto %r12b
 ; SSE41-NEXT:    orb %r15b, %r12b
+; SSE41-NEXT:    orb %bpl, %r12b
 ; SSE41-NEXT:    leaq (%rsi,%rax), %r10
 ; SSE41-NEXT:    movq %rdi, %rax
 ; SSE41-NEXT:    mulq %r8
@@ -3086,7 +3087,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    addq %r10, %rsi
 ; SSE41-NEXT:    setb %r10b
 ; SSE41-NEXT:    orb %r12b, %r10b
-; SSE41-NEXT:    orb %bpl, %r10b
 ; SSE41-NEXT:    testq %r9, %r9
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    testq %r11, %r11
@@ -3100,13 +3100,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    mulq %rcx
 ; SSE41-NEXT:    seto %r9b
 ; SSE41-NEXT:    orb %r11b, %r9b
+; SSE41-NEXT:    orb %bpl, %r9b
 ; SSE41-NEXT:    addq %rax, %r8
 ; SSE41-NEXT:    movq %rcx, %rax
 ; SSE41-NEXT:    mulq %r14
 ; SSE41-NEXT:    addq %r8, %rdx
 ; SSE41-NEXT:    setb %cl
 ; SSE41-NEXT:    orb %r9b, %cl
-; SSE41-NEXT:    orb %bpl, %cl
 ; SSE41-NEXT:    movzbl %cl, %ecx
 ; SSE41-NEXT:    negl %ecx
 ; SSE41-NEXT:    movzbl %r10b, %r8d
@@ -3150,6 +3150,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    mulq %rdi
 ; AVX-NEXT:    seto %r12b
 ; AVX-NEXT:    orb %r15b, %r12b
+; AVX-NEXT:    orb %bpl, %r12b
 ; AVX-NEXT:    leaq (%rsi,%rax), %r10
 ; AVX-NEXT:    movq %rdi, %rax
 ; AVX-NEXT:    mulq %r8
@@ -3158,7 +3159,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    addq %r10, %rsi
 ; AVX-NEXT:    setb %r10b
 ; AVX-NEXT:    orb %r12b, %r10b
-; AVX-NEXT:    orb %bpl, %r10b
 ; AVX-NEXT:    testq %r9, %r9
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    testq %r11, %r11
@@ -3172,13 +3172,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    mulq %rcx
 ; AVX-NEXT:    seto %r9b
 ; AVX-NEXT:    orb %r11b, %r9b
+; AVX-NEXT:    orb %bpl, %r9b
 ; AVX-NEXT:    addq %rax, %r8
 ; AVX-NEXT:    movq %rcx, %rax
 ; AVX-NEXT:    mulq %r14
 ; AVX-NEXT:    addq %r8, %rdx
 ; AVX-NEXT:    setb %cl
 ; AVX-NEXT:    orb %r9b, %cl
-; AVX-NEXT:    orb %bpl, %cl
 ; AVX-NEXT:    movzbl %cl, %ecx
 ; AVX-NEXT:    negl %ecx
 ; AVX-NEXT:    movzbl %r10b, %r8d
@@ -3221,6 +3221,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    mulq %rcx
 ; AVX512F-NEXT:    seto %r12b
 ; AVX512F-NEXT:    orb %r15b, %r12b
+; AVX512F-NEXT:    orb %bpl, %r12b
 ; AVX512F-NEXT:    addq %rax, %r11
 ; AVX512F-NEXT:    movq %rcx, %rax
 ; AVX512F-NEXT:    mulq %r14
@@ -3229,7 +3230,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    addq %r11, %rcx
 ; AVX512F-NEXT:    setb %al
 ; AVX512F-NEXT:    orb %r12b, %al
-; AVX512F-NEXT:    orb %bpl, %al
 ; AVX512F-NEXT:    kmovw %eax, %k0
 ; AVX512F-NEXT:    testq %r9, %r9
 ; AVX512F-NEXT:    setne %al
@@ -3244,13 +3244,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    mulq %rdi
 ; AVX512F-NEXT:    seto %r9b
 ; AVX512F-NEXT:    orb %bpl, %r9b
+; AVX512F-NEXT:    orb %r11b, %r9b
 ; AVX512F-NEXT:    addq %rax, %r10
 ; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    addq %r10, %rdx
 ; AVX512F-NEXT:    setb %dil
 ; AVX512F-NEXT:    orb %r9b, %dil
-; AVX512F-NEXT:    orb %r11b, %dil
 ; AVX512F-NEXT:    andl $1, %edi
 ; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    kshiftlw $1, %k0, %k0
@@ -3293,6 +3293,7 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    mulq %rcx
 ; AVX512BW-NEXT:    seto %r12b
 ; AVX512BW-NEXT:    orb %r15b, %r12b
+; AVX512BW-NEXT:    orb %bpl, %r12b
 ; AVX512BW-NEXT:    addq %rax, %r11
 ; AVX512BW-NEXT:    movq %rcx, %rax
 ; AVX512BW-NEXT:    mulq %r14
@@ -3301,7 +3302,6 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    addq %r11, %rcx
 ; AVX512BW-NEXT:    setb %al
 ; AVX512BW-NEXT:    orb %r12b, %al
-; AVX512BW-NEXT:    orb %bpl, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k0
 ; AVX512BW-NEXT:    testq %r9, %r9
 ; AVX512BW-NEXT:    setne %al
@@ -3316,13 +3316,13 @@ define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    mulq %rdi
 ; AVX512BW-NEXT:    seto %r9b
 ; AVX512BW-NEXT:    orb %bpl, %r9b
+; AVX512BW-NEXT:    orb %r11b, %r9b
 ; AVX512BW-NEXT:    addq %rax, %r10
 ; AVX512BW-NEXT:    movq %rdi, %rax
 ; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    addq %r10, %rdx
 ; AVX512BW-NEXT:    setb %dil
 ; AVX512BW-NEXT:    orb %r9b, %dil
-; AVX512BW-NEXT:    orb %r11b, %dil
 ; AVX512BW-NEXT:    andl $1, %edi
 ; AVX512BW-NEXT:    kmovw %edi, %k1
 ; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0

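The vec_umulo diffs above show the new cost model picking a different association for the chain of overflow checks: one of the serial orb combines of the seto/setb results now happens earlier, so the flag ORs form a shallower tree rather than a single serial chain. A minimal sketch of the shape being rewritten (hypothetical IR, not a test from this patch; the function name is illustrative):

    define i8 @or_chain(i8 %a, i8 %b, i8 %c, i8 %d) {
      ; Serial form: (((a | b) | c) | d) is a three-deep dependence chain.
      %t0 = or i8 %a, %b
      %t1 = or i8 %t0, %c
      %t2 = or i8 %t1, %d
      ; Under the default latency model the combiner may reassociate this
      ; to ((a | b) | (c | d)), reducing the critical path to depth two.
      ret i8 %t2
    }
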
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 176276fec26a3..76cd08d375f32 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -457,25 +457,25 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; SSE2-NEXT:    pandn %xmm3, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm3
 ; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
 ; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    pslld $23, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT:    paddd %xmm5, %xmm4
-; SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
-; SSE2-NEXT:    pslld $16, %xmm4
-; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pslld $23, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    pslld $23, %xmm2
-; SSE2-NEXT:    paddd %xmm5, %xmm2
+; SSE2-NEXT:    paddd %xmm4, %xmm2
 ; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
 ; SSE2-NEXT:    pslld $16, %xmm2
 ; SSE2-NEXT:    psrad $16, %xmm2
-; SSE2-NEXT:    packssdw %xmm4, %xmm2
+; SSE2-NEXT:    packssdw %xmm1, %xmm2
 ; SSE2-NEXT:    paddw %xmm0, %xmm0
 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -687,25 +687,25 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
 ; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
 ; X86-SSE2-NEXT:    psrlw $1, %xmm3
 ; X86-SSE2-NEXT:    pand %xmm4, %xmm3
+; X86-SSE2-NEXT:    por %xmm1, %xmm3
 ; X86-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT:    pslld $23, %xmm4
-; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
-; X86-SSE2-NEXT:    paddd %xmm5, %xmm4
-; X86-SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
-; X86-SSE2-NEXT:    pslld $16, %xmm4
-; X86-SSE2-NEXT:    psrad $16, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT:    pslld $23, %xmm1
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; X86-SSE2-NEXT:    paddd %xmm4, %xmm1
+; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
+; X86-SSE2-NEXT:    pslld $16, %xmm1
+; X86-SSE2-NEXT:    psrad $16, %xmm1
 ; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; X86-SSE2-NEXT:    pslld $23, %xmm2
-; X86-SSE2-NEXT:    paddd %xmm5, %xmm2
+; X86-SSE2-NEXT:    paddd %xmm4, %xmm2
 ; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
 ; X86-SSE2-NEXT:    pslld $16, %xmm2
 ; X86-SSE2-NEXT:    psrad $16, %xmm2
-; X86-SSE2-NEXT:    packssdw %xmm4, %xmm2
+; X86-SSE2-NEXT:    packssdw %xmm1, %xmm2
 ; X86-SSE2-NEXT:    paddw %xmm0, %xmm0
 ; X86-SSE2-NEXT:    pmullw %xmm2, %xmm0
-; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    por %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
@@ -719,33 +719,33 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
 ; SSE2-NEXT:    pand %xmm5, %xmm6
 ; SSE2-NEXT:    psllw $5, %xmm6
-; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtb %xmm6, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, %xmm7
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    pcmpgtb %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    pandn %xmm1, %xmm7
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm7, %xmm3
 ; SSE2-NEXT:    paddb %xmm6, %xmm6
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    pandn %xmm4, %xmm7
-; SSE2-NEXT:    psrlw $2, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm7, %xmm4
+; SSE2-NEXT:    pandn %xmm3, %xmm7
+; SSE2-NEXT:    psrlw $2, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm7, %xmm3
 ; SSE2-NEXT:    paddb %xmm6, %xmm6
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    pandn %xmm4, %xmm6
-; SSE2-NEXT:    psrlw $1, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT:    por %xmm6, %xmm4
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    psrlw $1, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT:    por %xmm6, %xmm3
 ; SSE2-NEXT:    pandn %xmm5, %xmm2
 ; SSE2-NEXT:    psllw $5, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -767,13 +767,13 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm2
-; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
 ; SSE2-NEXT:    paddb %xmm0, %xmm0
-; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: var_funnnel_v16i8:
@@ -964,33 +964,33 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
 ; X86-SSE2-NEXT:    pand %xmm5, %xmm6
 ; X86-SSE2-NEXT:    psllw $5, %xmm6
-; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
-; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm4
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
+; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm7
 ; X86-SSE2-NEXT:    psrlw $4, %xmm1
-; X86-SSE2-NEXT:    pand %xmm1, %xmm4
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
-; X86-SSE2-NEXT:    por %xmm7, %xmm4
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    por %xmm7, %xmm3
 ; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
-; X86-SSE2-NEXT:    pandn %xmm4, %xmm7
-; X86-SSE2-NEXT:    psrlw $2, %xmm4
-; X86-SSE2-NEXT:    pand %xmm1, %xmm4
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
-; X86-SSE2-NEXT:    por %xmm7, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm7
+; X86-SSE2-NEXT:    psrlw $2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    por %xmm7, %xmm3
 ; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
-; X86-SSE2-NEXT:    pandn %xmm4, %xmm6
-; X86-SSE2-NEXT:    psrlw $1, %xmm4
-; X86-SSE2-NEXT:    pand %xmm1, %xmm4
-; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
-; X86-SSE2-NEXT:    por %xmm6, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    psrlw $1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT:    por %xmm6, %xmm3
 ; X86-SSE2-NEXT:    pandn %xmm5, %xmm2
 ; X86-SSE2-NEXT:    psllw $5, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -1012,13 +1012,13 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
 ; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    por %xmm5, %xmm0
 ; X86-SSE2-NEXT:    paddb %xmm2, %xmm2
-; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
-; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
-; X86-SSE2-NEXT:    por %xmm4, %xmm1
 ; X86-SSE2-NEXT:    paddb %xmm0, %xmm0
-; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
 ; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    por %xmm3, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
   ret <16 x i8> %res
@@ -1761,10 +1761,10 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; SSE2-NEXT:    pandn %xmm1, %xmm2
 ; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    paddw %xmm0, %xmm0
 ; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: constant_funnnel_v8i16:
@@ -1853,10 +1853,10 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
 ; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
 ; X86-SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
 ; X86-SSE2-NEXT:    paddw %xmm0, %xmm0
 ; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE2-NEXT:    por %xmm2, %xmm0
-; X86-SSE2-NEXT:    por %xmm1, %xmm0
 ; X86-SSE2-NEXT:    retl
   %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
   ret <8 x i16> %res

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 6b1a147b43c9d..350e0c193e0e7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -660,16 +660,16 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT:    punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535]
 ; SSE-NEXT:    pandn %xmm13, %xmm11
+; SSE-NEXT:    movdqa %xmm6, %xmm12
+; SSE-NEXT:    psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    por %xmm11, %xmm12
 ; SSE-NEXT:    psrld $16, %xmm9
-; SSE-NEXT:    movdqa %xmm4, %xmm12
-; SSE-NEXT:    punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; SSE-NEXT:    movdqa %xmm4, %xmm11
+; SSE-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
 ; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535]
 ; SSE-NEXT:    movdqa %xmm0, %xmm13
-; SSE-NEXT:    pandn %xmm12, %xmm13
-; SSE-NEXT:    movdqa %xmm6, %xmm9
-; SSE-NEXT:    psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT:    por %xmm11, %xmm13
-; SSE-NEXT:    por %xmm9, %xmm13
+; SSE-NEXT:    pandn %xmm11, %xmm13
+; SSE-NEXT:    por %xmm12, %xmm13
 ; SSE-NEXT:    movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT:    pand %xmm9, %xmm13
 ; SSE-NEXT:    pshufd {{.*#+}} xmm11 = xmm3[3,3,3,3]

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index dc4413cebacd7..95825285815c0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -724,51 +724,51 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX1-ONLY-LABEL: store_i8_stride5_vf16:
 ; AVX1-ONLY:       # %bb.0:
-; AVX1-ONLY-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1-ONLY-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX1-ONLY-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX1-ONLY-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX1-ONLY-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX1-ONLY-NEXT:    vmovdqa (%rdx), %xmm1
 ; AVX1-ONLY-NEXT:    vmovdqa (%rcx), %xmm4
 ; AVX1-ONLY-NEXT:    vmovdqa (%r8), %xmm0
 ; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[6],zero,xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9]
 ; AVX1-ONLY-NEXT:    vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9],zero,xmm1[u]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u]
 ; AVX1-ONLY-NEXT:    vpor %xmm6, %xmm7, %xmm6
 ; AVX1-ONLY-NEXT:    vmovdqa {{.*#+}} xmm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255>
 ; AVX1-ONLY-NEXT:    vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
 ; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15]
 ; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero
 ; AVX1-ONLY-NEXT:    vpor %xmm6, %xmm5, %xmm5
-; AVX1-ONLY-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; AVX1-ONLY-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
 ; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero
-; AVX1-ONLY-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX1-ONLY-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
 ; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero
-; AVX1-ONLY-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,xmm9[0,1],zero,zero,zero,xmm9[2,3],zero,zero,zero,xmm9[4,5],zero,zero
-; AVX1-ONLY-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1],zero,zero,zero,xmm11[2,3],zero,zero,zero,xmm11[4,5],zero,zero,zero,xmm11[6]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
-; AVX1-ONLY-NEXT:    vpor %xmm12, %xmm10, %xmm10
-; AVX1-ONLY-NEXT:    vpor %xmm10, %xmm11, %xmm10
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm9 = zero,xmm9[6,7],zero,zero,zero,xmm9[8,9],zero,zero,zero,xmm9[10,11],zero,zero,zero
-; AVX1-ONLY-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6],zero,zero,zero,xmm1[9,8],zero,zero,zero,xmm1[11,10],zero,zero,zero,xmm1[13,12]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero
-; AVX1-ONLY-NEXT:    vpor %xmm2, %xmm9, %xmm2
-; AVX1-ONLY-NEXT:    vpor %xmm2, %xmm1, %xmm1
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero
-; AVX1-ONLY-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[2],zero,zero,zero,xmm3[5,4],zero,zero,zero,xmm3[7,6],zero,zero,zero,xmm3[9,8]
-; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
-; AVX1-ONLY-NEXT:    vpor %xmm4, %xmm2, %xmm2
-; AVX1-ONLY-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX1-ONLY-NEXT:    vmovdqa %xmm2, 48(%r9)
-; AVX1-ONLY-NEXT:    vmovdqa %xmm1, 16(%r9)
-; AVX1-ONLY-NEXT:    vmovdqa %xmm10, (%r9)
+; AVX1-ONLY-NEXT:    vpor %xmm6, %xmm8, %xmm6
+; AVX1-ONLY-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero
+; AVX1-ONLY-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6]
+; AVX1-ONLY-NEXT:    vpor %xmm9, %xmm10, %xmm9
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
+; AVX1-ONLY-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero
+; AVX1-ONLY-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12]
+; AVX1-ONLY-NEXT:    vpor %xmm2, %xmm8, %xmm2
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero
+; AVX1-ONLY-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero
+; AVX1-ONLY-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8]
+; AVX1-ONLY-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
+; AVX1-ONLY-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-ONLY-NEXT:    vmovdqa %xmm1, 48(%r9)
+; AVX1-ONLY-NEXT:    vmovdqa %xmm2, 16(%r9)
+; AVX1-ONLY-NEXT:    vmovdqa %xmm9, (%r9)
 ; AVX1-ONLY-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
 ; AVX1-ONLY-NEXT:    vpor %xmm0, %xmm6, %xmm0
-; AVX1-ONLY-NEXT:    vpor %xmm0, %xmm8, %xmm0
 ; AVX1-ONLY-NEXT:    vmovdqa %xmm0, 64(%r9)
 ; AVX1-ONLY-NEXT:    vmovdqa %xmm5, 32(%r9)
 ; AVX1-ONLY-NEXT:    retq
@@ -809,8 +809,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
 ; AVX2-SLOW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
+; AVX2-SLOW-NEXT:    vpor %xmm3, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
-; AVX2-SLOW-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vmovdqa %xmm0, 64(%r9)
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm5, (%r9)
@@ -851,8 +851,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
 ; AVX2-FAST-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
+; AVX2-FAST-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
-; AVX2-FAST-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX2-FAST-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vmovdqa %xmm0, 64(%r9)
 ; AVX2-FAST-NEXT:    vmovdqa %ymm5, 32(%r9)
@@ -896,8 +896,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
 ; AVX2-FAST-PERLANE-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT:    vpor %xmm3, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
-; AVX2-FAST-PERLANE-NEXT:    vpor %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-PERLANE-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm0, 64(%r9)
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm5, (%r9)

diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index 268594c2f88f8..c250677f9d0a6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -428,14 +428,14 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX2-SLOW-NEXT:    vpor %ymm6, %ymm4, %ymm4
 ; AVX2-SLOW-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
 ; AVX2-SLOW-NEXT:    vpor %ymm3, %ymm1, %ymm1
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-SLOW-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rax)
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm2, (%rax)
@@ -508,14 +508,14 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpor %ymm2, %ymm6, %ymm2
+; AVX2-FAST-PERLANE-NEXT:    vpor %ymm6, %ymm4, %ymm4
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm3, %ymm1, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vpor %ymm3, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rax)
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm2, (%rax)

diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index c982ccec2d1e2..03d525ef0aa30 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -322,12 +322,12 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
-; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,0,3,2]
+; SSE2-NEXT:    pand %xmm3, %xmm1
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;

diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 24c963f346074..4fbc41389dc5d 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -114,10 +114,10 @@ define i64 @test_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    psrlq $60, %xmm2
 ; SSE2-NEXT:    psrlq $60, %xmm0
+; SSE2-NEXT:    paddq %xmm2, %xmm0
 ; SSE2-NEXT:    psrlq $60, %xmm3
 ; SSE2-NEXT:    psrlq $60, %xmm1
 ; SSE2-NEXT:    paddq %xmm3, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
@@ -128,10 +128,10 @@ define i64 @test_v8i64_v8i8(<8 x i64> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    psrlq $60, %xmm2
 ; SSE41-NEXT:    psrlq $60, %xmm0
+; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    psrlq $60, %xmm3
 ; SSE41-NEXT:    psrlq $60, %xmm1
 ; SSE41-NEXT:    paddq %xmm3, %xmm1
-; SSE41-NEXT:    paddq %xmm2, %xmm1
 ; SSE41-NEXT:    paddq %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT:    paddq %xmm1, %xmm0
@@ -142,13 +142,13 @@ define i64 @test_v8i64_v8i8(<8 x i64> %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm2
 ; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm3
+; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
@@ -188,19 +188,19 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [1,1]
 ; SSE2-NEXT:    pand %xmm8, %xmm5
 ; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    paddq %xmm5, %xmm1
 ; SSE2-NEXT:    pand %xmm8, %xmm7
 ; SSE2-NEXT:    pand %xmm8, %xmm3
 ; SSE2-NEXT:    paddq %xmm7, %xmm3
-; SSE2-NEXT:    paddq %xmm5, %xmm3
 ; SSE2-NEXT:    paddq %xmm1, %xmm3
 ; SSE2-NEXT:    pand %xmm8, %xmm4
 ; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    pand %xmm8, %xmm6
 ; SSE2-NEXT:    pand %xmm8, %xmm2
 ; SSE2-NEXT:    paddq %xmm6, %xmm2
-; SSE2-NEXT:    paddq %xmm4, %xmm2
-; SSE2-NEXT:    paddq %xmm3, %xmm2
 ; SSE2-NEXT:    paddq %xmm0, %xmm2
+; SSE2-NEXT:    paddq %xmm3, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddq %xmm2, %xmm0
 ; SSE2-NEXT:    movq %xmm0, %rax
@@ -211,19 +211,19 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [1,1]
 ; SSE41-NEXT:    pand %xmm8, %xmm5
 ; SSE41-NEXT:    pand %xmm8, %xmm1
+; SSE41-NEXT:    paddq %xmm5, %xmm1
 ; SSE41-NEXT:    pand %xmm8, %xmm7
 ; SSE41-NEXT:    pand %xmm8, %xmm3
 ; SSE41-NEXT:    paddq %xmm7, %xmm3
-; SSE41-NEXT:    paddq %xmm5, %xmm3
 ; SSE41-NEXT:    paddq %xmm1, %xmm3
 ; SSE41-NEXT:    pand %xmm8, %xmm4
 ; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    paddq %xmm4, %xmm0
 ; SSE41-NEXT:    pand %xmm8, %xmm6
 ; SSE41-NEXT:    pand %xmm8, %xmm2
 ; SSE41-NEXT:    paddq %xmm6, %xmm2
-; SSE41-NEXT:    paddq %xmm4, %xmm2
-; SSE41-NEXT:    paddq %xmm3, %xmm2
 ; SSE41-NEXT:    paddq %xmm0, %xmm2
+; SSE41-NEXT:    paddq %xmm3, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    movq %xmm0, %rax
@@ -237,16 +237,16 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) {
 ; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
@@ -258,10 +258,10 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) {
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1]
 ; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm2
 ; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
@@ -508,10 +508,10 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE2-NEXT:    pand %xmm4, %xmm2
 ; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    pand %xmm4, %xmm3
 ; SSE2-NEXT:    pand %xmm4, %xmm1
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
@@ -525,10 +525,10 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE41-NEXT:    pand %xmm4, %xmm2
 ; SSE41-NEXT:    pand %xmm4, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm0
 ; SSE41-NEXT:    pand %xmm4, %xmm3
 ; SSE41-NEXT:    pand %xmm4, %xmm1
 ; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    paddd %xmm2, %xmm1
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT:    paddd %xmm1, %xmm0
@@ -545,8 +545,8 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -563,8 +563,8 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) {
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
@@ -609,19 +609,19 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE2-NEXT:    pand %xmm8, %xmm5
 ; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    paddd %xmm5, %xmm1
 ; SSE2-NEXT:    pand %xmm8, %xmm7
 ; SSE2-NEXT:    pand %xmm8, %xmm3
 ; SSE2-NEXT:    paddd %xmm7, %xmm3
-; SSE2-NEXT:    paddd %xmm5, %xmm3
 ; SSE2-NEXT:    paddd %xmm1, %xmm3
 ; SSE2-NEXT:    pand %xmm8, %xmm4
 ; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    paddd %xmm4, %xmm0
 ; SSE2-NEXT:    pand %xmm8, %xmm6
 ; SSE2-NEXT:    pand %xmm8, %xmm2
 ; SSE2-NEXT:    paddd %xmm6, %xmm2
-; SSE2-NEXT:    paddd %xmm4, %xmm2
-; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm3, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -634,19 +634,19 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE41-NEXT:    pand %xmm8, %xmm5
 ; SSE41-NEXT:    pand %xmm8, %xmm1
+; SSE41-NEXT:    paddd %xmm5, %xmm1
 ; SSE41-NEXT:    pand %xmm8, %xmm7
 ; SSE41-NEXT:    pand %xmm8, %xmm3
 ; SSE41-NEXT:    paddd %xmm7, %xmm3
-; SSE41-NEXT:    paddd %xmm5, %xmm3
 ; SSE41-NEXT:    paddd %xmm1, %xmm3
 ; SSE41-NEXT:    pand %xmm8, %xmm4
 ; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    paddd %xmm4, %xmm0
 ; SSE41-NEXT:    pand %xmm8, %xmm6
 ; SSE41-NEXT:    pand %xmm8, %xmm2
 ; SSE41-NEXT:    paddd %xmm6, %xmm2
-; SSE41-NEXT:    paddd %xmm4, %xmm2
-; SSE41-NEXT:    paddd %xmm3, %xmm2
 ; SSE41-NEXT:    paddd %xmm0, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE41-NEXT:    paddd %xmm2, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -662,16 +662,16 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
 ; AVX1-SLOW-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX1-SLOW-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -688,16 +688,16 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
 ; AVX1-FAST-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX1-FAST-NEXT:    vandps %ymm4, %ymm3, %ymm3
 ; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
@@ -710,10 +710,10 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
 ; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm2
 ; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -1144,9 +1144,9 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
 ; SSE2-NEXT:    psadbw %xmm1, %xmm2
 ; SSE2-NEXT:    paddq %xmm6, %xmm2
 ; SSE2-NEXT:    psadbw %xmm1, %xmm4
-; SSE2-NEXT:    paddq %xmm2, %xmm4
 ; SSE2-NEXT:    psadbw %xmm1, %xmm0
 ; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    paddq %xmm2, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm1, %eax
@@ -1173,9 +1173,9 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
 ; SSE41-NEXT:    psadbw %xmm1, %xmm2
 ; SSE41-NEXT:    paddq %xmm6, %xmm2
 ; SSE41-NEXT:    psadbw %xmm1, %xmm4
-; SSE41-NEXT:    paddq %xmm2, %xmm4
 ; SSE41-NEXT:    psadbw %xmm1, %xmm0
 ; SSE41-NEXT:    paddq %xmm4, %xmm0
+; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    paddq %xmm0, %xmm1
 ; SSE41-NEXT:    movd %xmm1, %eax
@@ -1200,10 +1200,10 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0

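The vector-reduce-add diffs above show the same effect on paddd/paddq reduction chains: independent pairs are summed first and the partial sums combined at the end, instead of accumulating every operand into one serial chain. A reduced sketch of that shape (hypothetical IR in the spirit of these tests, not taken from them):

    define <2 x i64> @reduce4(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
      ; Serial form: (((a + b) + c) + d).
      %t0 = add <2 x i64> %a, %b
      %t1 = add <2 x i64> %t0, %c
      %t2 = add <2 x i64> %t1, %d
      ; The combiner may rewrite this as ((a + b) + (c + d)) so the two
      ; inner paddq instructions can issue in parallel.
      ret <2 x i64> %t2
    }
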
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
index f9972bf69cb22..dc0ebe6b2e2ef 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll
@@ -122,10 +122,10 @@ define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE2-NEXT:    paddq %xmm4, %xmm5
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
-; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    paddq %xmm5, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
@@ -137,14 +137,14 @@ define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
 ; SSE41-NEXT:    pmovsxbq %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psrlq $48, %xmm3
-; SSE41-NEXT:    pmovsxbq %xmm3, %xmm3
+; SSE41-NEXT:    paddq %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlq $48, %xmm1
+; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm0
 ; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
-; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    paddq %xmm1, %xmm0
+; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    paddq %xmm0, %xmm1
 ; SSE41-NEXT:    movq %xmm1, %rax
@@ -160,7 +160,7 @@ define i64 @test_v8i64_v8i8(<8 x i8> %a0) {
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
@@ -202,12 +202,12 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
 ; SSE2-LABEL: test_v16i64_v16i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
@@ -216,28 +216,28 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm7
 ; SSE2-NEXT:    movdqa %xmm0, %xmm8
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; SSE2-NEXT:    paddq %xmm5, %xmm8
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm4
-; SSE2-NEXT:    pxor %xmm9, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm9
-; SSE2-NEXT:    movdqa %xmm4, %xmm10
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm9
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm6, %xmm11
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
-; SSE2-NEXT:    paddq %xmm10, %xmm11
-; SSE2-NEXT:    paddq %xmm5, %xmm11
-; SSE2-NEXT:    paddq %xmm8, %xmm11
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSE2-NEXT:    movdqa %xmm6, %xmm10
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
+; SSE2-NEXT:    paddq %xmm9, %xmm10
+; SSE2-NEXT:    paddq %xmm8, %xmm10
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
 ; SSE2-NEXT:    paddq %xmm4, %xmm6
-; SSE2-NEXT:    paddq %xmm1, %xmm6
-; SSE2-NEXT:    paddq %xmm11, %xmm6
 ; SSE2-NEXT:    paddq %xmm0, %xmm6
+; SSE2-NEXT:    paddq %xmm10, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
 ; SSE2-NEXT:    paddq %xmm6, %xmm0
 ; SSE2-NEXT:    movq %xmm0, %rax
@@ -247,31 +247,31 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
-; SSE41-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    pmovsxbq %xmm2, %xmm2
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psrlq $48, %xmm3
-; SSE41-NEXT:    pmovsxbq %xmm3, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm3
-; SSE41-NEXT:    pmovsxbq %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
+; SSE41-NEXT:    pmovsxbq %xmm0, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
 ; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE41-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE41-NEXT:    paddq %xmm0, %xmm3
 ; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    pmovsxbq %xmm1, %xmm0
-; SSE41-NEXT:    paddq %xmm0, %xmm3
-; SSE41-NEXT:    pmovsxbq %xmm4, %xmm0
-; SSE41-NEXT:    pmovsxbq %xmm5, %xmm1
-; SSE41-NEXT:    pmovsxbq %xmm6, %xmm4
-; SSE41-NEXT:    paddq %xmm1, %xmm4
-; SSE41-NEXT:    paddq %xmm0, %xmm4
-; SSE41-NEXT:    paddq %xmm3, %xmm4
-; SSE41-NEXT:    paddq %xmm2, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE41-NEXT:    pmovsxbq %xmm1, %xmm1
+; SSE41-NEXT:    paddq %xmm0, %xmm1
+; SSE41-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    pmovsxbq %xmm2, %xmm0
+; SSE41-NEXT:    psrlq $48, %xmm3
+; SSE41-NEXT:    pmovsxbq %xmm3, %xmm2
+; SSE41-NEXT:    paddq %xmm0, %xmm2
+; SSE41-NEXT:    paddq %xmm1, %xmm2
+; SSE41-NEXT:    pmovsxbq %xmm5, %xmm0
 ; SSE41-NEXT:    paddq %xmm4, %xmm0
+; SSE41-NEXT:    pmovsxbq %xmm6, %xmm1
+; SSE41-NEXT:    pmovsxbq %xmm7, %xmm3
+; SSE41-NEXT:    paddq %xmm1, %xmm3
+; SSE41-NEXT:    paddq %xmm0, %xmm3
+; SSE41-NEXT:    paddq %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE41-NEXT:    paddq %xmm3, %xmm0
 ; SSE41-NEXT:    movq %xmm0, %rax
 ; SSE41-NEXT:    retq
 ;
@@ -283,23 +283,23 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
 ; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
 ; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm4
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm7, %xmm7
-; AVX1-NEXT:    vpaddq %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpaddq %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3]
+; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3]
+; AVX1-NEXT:    vpmovsxwq %xmm6, %xmm6
+; AVX1-NEXT:    vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
 ; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm4
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
 ; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
@@ -317,7 +317,7 @@ define i64 @test_v16i64_v16i8(<16 x i8> %a0) {
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
 ; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
@@ -562,12 +562,12 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; SSE2-NEXT:    psrad $24, %xmm3
+; SSE2-NEXT:    paddd %xmm2, %xmm3
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
 ; SSE2-NEXT:    paddd %xmm3, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
@@ -581,13 +581,13 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
 ; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; SSE41-NEXT:    pmovsxbd %xmm3, %xmm3
+; SSE41-NEXT:    paddd %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    paddd %xmm3, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm0
 ; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -600,12 +600,12 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm1
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm2, %xmm2
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT:    vpmovsxbd %xmm3, %xmm3
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT:    vpmovsxbd %xmm2, %xmm2
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -619,12 +619,12 @@ define i32 @test_v16i32_v16i8(<16 x i8> %a0) {
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm1
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm2, %xmm2
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT:    vpmovsxbd %xmm3, %xmm3
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT:    vpmovsxbd %xmm2, %xmm2
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -676,27 +676,27 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; SSE2-NEXT:    psrad $24, %xmm5
+; SSE2-NEXT:    paddd %xmm3, %xmm5
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; SSE2-NEXT:    psrad $24, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    psrad $24, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT:    psrad $24, %xmm7
-; SSE2-NEXT:    paddd %xmm6, %xmm7
-; SSE2-NEXT:    paddd %xmm3, %xmm7
-; SSE2-NEXT:    paddd %xmm5, %xmm7
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    psrad $24, %xmm6
+; SSE2-NEXT:    paddd %xmm3, %xmm6
+; SSE2-NEXT:    paddd %xmm5, %xmm6
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    psrad $24, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
 ; SSE2-NEXT:    psrad $24, %xmm3
+; SSE2-NEXT:    paddd %xmm2, %xmm3
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    paddd %xmm7, %xmm0
 ; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    paddd %xmm6, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -710,23 +710,23 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
 ; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; SSE41-NEXT:    pmovsxbd %xmm3, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
+; SSE41-NEXT:    paddd %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
 ; SSE41-NEXT:    pmovsxbd %xmm4, %xmm4
-; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; SSE41-NEXT:    pmovsxbd %xmm5, %xmm5
-; SSE41-NEXT:    paddd %xmm4, %xmm5
-; SSE41-NEXT:    paddd %xmm2, %xmm5
-; SSE41-NEXT:    paddd %xmm3, %xmm5
+; SSE41-NEXT:    paddd %xmm2, %xmm4
+; SSE41-NEXT:    paddd %xmm3, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; SSE41-NEXT:    pmovsxbd %xmm2, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    pmovsxbd %xmm3, %xmm3
 ; SSE41-NEXT:    paddd %xmm2, %xmm3
 ; SSE41-NEXT:    pmovsxbd %xmm1, %xmm1
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    paddd %xmm5, %xmm1
 ; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
 ; SSE41-NEXT:    paddd %xmm1, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    paddd %xmm4, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    paddd %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -741,23 +741,23 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm2, %xmm2
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm3, %xmm3
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; AVX1-SLOW-NEXT:    vpmovsxbd %xmm4, %xmm4
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT:    vpmovsxbd %xmm5, %xmm5
-; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
-; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX1-SLOW-NEXT:    vpmovsxbd %xmm3, %xmm3
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT:    vpmovsxbd %xmm4, %xmm4
+; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm3, %xmm3
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm4, %xmm4
 ; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -773,23 +773,23 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm2, %xmm2
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm3, %xmm3
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; AVX1-FAST-NEXT:    vpmovsxbd %xmm4, %xmm4
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT:    vpmovsxbd %xmm5, %xmm5
-; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
-; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
 ; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT:    vpmovsxbd %xmm3, %xmm3
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT:    vpmovsxbd %xmm4, %xmm4
+; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm3, %xmm3
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm4, %xmm4
 ; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
@@ -806,9 +806,9 @@ define i32 @test_v32i32_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
 ; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1171,12 +1171,12 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm2, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm1
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm0
 ; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
 ; SSE2-NEXT:    paddw %xmm3, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    paddw %xmm0, %xmm1
@@ -1197,9 +1197,9 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
 ; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
 ; SSE41-NEXT:    paddw %xmm2, %xmm3
 ; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
 ; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
 ; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    paddw %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    paddw %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -1220,9 +1220,9 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
 ; AVX1-SLOW-NEXT:    vpmovsxbw %xmm3, %xmm3
 ; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
 ; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1243,9 +1243,9 @@ define i16 @test_v32i16_v32i8(<32 x i8> %a0) {
 ; AVX1-FAST-NEXT:    vpmovsxbw %xmm3, %xmm3
 ; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
 ; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1304,25 +1304,25 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
 ; SSE2-NEXT:    psraw $8, %xmm4
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
 ; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; SSE2-NEXT:    paddw %xmm4, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; SSE2-NEXT:    psraw $8, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15]
 ; SSE2-NEXT:    psraw $8, %xmm6
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
-; SSE2-NEXT:    psraw $8, %xmm7
-; SSE2-NEXT:    paddw %xmm6, %xmm7
-; SSE2-NEXT:    paddw %xmm4, %xmm7
-; SSE2-NEXT:    paddw %xmm5, %xmm7
+; SSE2-NEXT:    paddw %xmm4, %xmm6
+; SSE2-NEXT:    paddw %xmm5, %xmm6
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm3
+; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm1
-; SSE2-NEXT:    paddw %xmm3, %xmm1
 ; SSE2-NEXT:    paddw %xmm2, %xmm1
-; SSE2-NEXT:    paddw %xmm7, %xmm1
 ; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    paddw %xmm6, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    paddw %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1338,23 +1338,23 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmovsxbw %xmm2, %xmm4
 ; SSE41-NEXT:    pmovsxbw %xmm0, %xmm5
-; SSE41-NEXT:    pmovsxbw %xmm3, %xmm6
-; SSE41-NEXT:    pmovsxbw %xmm1, %xmm7
-; SSE41-NEXT:    paddw %xmm6, %xmm7
-; SSE41-NEXT:    paddw %xmm4, %xmm7
+; SSE41-NEXT:    paddw %xmm4, %xmm5
+; SSE41-NEXT:    pmovsxbw %xmm3, %xmm4
+; SSE41-NEXT:    pmovsxbw %xmm1, %xmm6
+; SSE41-NEXT:    paddw %xmm4, %xmm6
+; SSE41-NEXT:    paddw %xmm5, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    pmovsxbw %xmm0, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; SSE41-NEXT:    pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT:    paddw %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+; SSE41-NEXT:    pmovsxbw %xmm2, %xmm2
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; SSE41-NEXT:    pmovsxbw %xmm1, %xmm1
-; SSE41-NEXT:    paddw %xmm3, %xmm1
 ; SSE41-NEXT:    paddw %xmm2, %xmm1
 ; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    paddw %xmm7, %xmm1
-; SSE41-NEXT:    paddw %xmm5, %xmm1
+; SSE41-NEXT:    paddw %xmm6, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE41-NEXT:    paddw %xmm1, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1370,25 +1370,25 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
 ; AVX1-SLOW:       # %bb.0:
 ; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm2
 ; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm3
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-SLOW-NEXT:    vpmovsxbw %xmm4, %xmm5
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-SLOW-NEXT:    vpmovsxbw %xmm6, %xmm7
-; AVX1-SLOW-NEXT:    vpaddw %xmm5, %xmm7, %xmm5
-; AVX1-SLOW-NEXT:    vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-SLOW-NEXT:    vpmovsxbw %xmm3, %xmm4
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-SLOW-NEXT:    vpmovsxbw %xmm5, %xmm6
+; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm6, %xmm4
+; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpmovsxbw %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX1-SLOW-NEXT:    vpmovsxbw %xmm4, %xmm4
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; AVX1-SLOW-NEXT:    vpmovsxbw %xmm5, %xmm5
-; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm5, %xmm4
-; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
-; AVX1-SLOW-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1404,25 +1404,25 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm2
 ; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm3
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-FAST-NEXT:    vpmovsxbw %xmm4, %xmm5
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-FAST-NEXT:    vpmovsxbw %xmm6, %xmm7
-; AVX1-FAST-NEXT:    vpaddw %xmm5, %xmm7, %xmm5
-; AVX1-FAST-NEXT:    vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-FAST-NEXT:    vpmovsxbw %xmm3, %xmm4
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-FAST-NEXT:    vpmovsxbw %xmm5, %xmm6
+; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm6, %xmm4
+; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpmovsxbw %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX1-FAST-NEXT:    vpmovsxbw %xmm4, %xmm4
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; AVX1-FAST-NEXT:    vpmovsxbw %xmm5, %xmm5
-; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm5, %xmm4
-; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX1-FAST-NEXT:    vpmovsxbw %xmm1, %xmm1
+; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; AVX1-FAST-NEXT:    vpmovsxbw %xmm3, %xmm3
+; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
-; AVX1-FAST-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1437,13 +1437,13 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm2
 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm3
+; AVX2-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpaddw %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1811,12 +1811,12 @@ define i8 @test_v64i8_v64i1(<64 x i8> %a0) {
 ; SSE2-NEXT:    pcmpgtb %xmm2, %xmm5
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE2-NEXT:    paddb %xmm5, %xmm2
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpgtb %xmm3, %xmm0
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
 ; SSE2-NEXT:    paddb %xmm0, %xmm3
-; SSE2-NEXT:    paddb %xmm5, %xmm3
 ; SSE2-NEXT:    paddb %xmm2, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE2-NEXT:    paddb %xmm3, %xmm0
@@ -1832,12 +1832,12 @@ define i8 @test_v64i8_v64i1(<64 x i8> %a0) {
 ; SSE41-NEXT:    pcmpgtb %xmm2, %xmm5
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    pcmpgtb %xmm0, %xmm2
+; SSE41-NEXT:    paddb %xmm5, %xmm2
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
 ; SSE41-NEXT:    pcmpgtb %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
 ; SSE41-NEXT:    paddb %xmm0, %xmm3
-; SSE41-NEXT:    paddb %xmm5, %xmm3
 ; SSE41-NEXT:    paddb %xmm2, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
 ; SSE41-NEXT:    paddb %xmm3, %xmm0
@@ -1851,13 +1851,13 @@ define i8 @test_v64i8_v64i1(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
@@ -1913,27 +1913,27 @@ define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
 ; SSE2-NEXT:    pcmpgtb %xmm4, %xmm9
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    pcmpgtb %xmm0, %xmm4
+; SSE2-NEXT:    paddb %xmm9, %xmm4
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpgtb %xmm6, %xmm0
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
 ; SSE2-NEXT:    pcmpgtb %xmm2, %xmm6
 ; SSE2-NEXT:    paddb %xmm0, %xmm6
-; SSE2-NEXT:    paddb %xmm9, %xmm6
+; SSE2-NEXT:    paddb %xmm4, %xmm6
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpgtb %xmm5, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm7, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pcmpgtb %xmm7, %xmm1
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pcmpgtb %xmm3, %xmm5
-; SSE2-NEXT:    paddb %xmm1, %xmm5
-; SSE2-NEXT:    paddb %xmm0, %xmm5
-; SSE2-NEXT:    paddb %xmm2, %xmm5
-; SSE2-NEXT:    paddb %xmm6, %xmm5
-; SSE2-NEXT:    paddb %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; SSE2-NEXT:    paddb %xmm5, %xmm0
+; SSE2-NEXT:    pcmpgtb %xmm3, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    paddb %xmm1, %xmm0
 ; SSE2-NEXT:    psadbw %xmm8, %xmm0
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
@@ -1946,27 +1946,27 @@ define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
 ; SSE41-NEXT:    pcmpgtb %xmm4, %xmm9
 ; SSE41-NEXT:    pxor %xmm4, %xmm4
 ; SSE41-NEXT:    pcmpgtb %xmm0, %xmm4
+; SSE41-NEXT:    paddb %xmm9, %xmm4
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
 ; SSE41-NEXT:    pcmpgtb %xmm6, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm6
 ; SSE41-NEXT:    pcmpgtb %xmm2, %xmm6
 ; SSE41-NEXT:    paddb %xmm0, %xmm6
-; SSE41-NEXT:    paddb %xmm9, %xmm6
+; SSE41-NEXT:    paddb %xmm4, %xmm6
 ; SSE41-NEXT:    pxor %xmm0, %xmm0
 ; SSE41-NEXT:    pcmpgtb %xmm5, %xmm0
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    pcmpgtb %xmm1, %xmm2
+; SSE41-NEXT:    paddb %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pcmpgtb %xmm7, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pcmpgtb %xmm7, %xmm1
-; SSE41-NEXT:    pxor %xmm5, %xmm5
-; SSE41-NEXT:    pcmpgtb %xmm3, %xmm5
-; SSE41-NEXT:    paddb %xmm1, %xmm5
-; SSE41-NEXT:    paddb %xmm0, %xmm5
-; SSE41-NEXT:    paddb %xmm2, %xmm5
-; SSE41-NEXT:    paddb %xmm6, %xmm5
-; SSE41-NEXT:    paddb %xmm4, %xmm5
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
-; SSE41-NEXT:    paddb %xmm5, %xmm0
+; SSE41-NEXT:    pcmpgtb %xmm3, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm2, %xmm1
+; SSE41-NEXT:    paddb %xmm6, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE41-NEXT:    paddb %xmm1, %xmm0
 ; SSE41-NEXT:    psadbw %xmm8, %xmm0
 ; SSE41-NEXT:    movd %xmm0, %eax
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
@@ -1977,23 +1977,23 @@ define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm4, %xmm5
 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm4, %xmm6
-; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm4, %xmm7
-; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm4, %xmm8
-; AVX1-NEXT:    vpaddb %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vpaddb %xmm7, %xmm5, %xmm5
+; AVX1-NEXT:    vpaddb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm4, %xmm6
+; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm4, %xmm7
+; AVX1-NEXT:    vpaddb %xmm6, %xmm7, %xmm6
+; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
@@ -2007,10 +2007,10 @@ define i8 @test_v128i8_v128i1(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm4, %ymm2
 ; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm4, %ymm1
-; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
index 89d299ff1eb45..937ac3d2db885 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
@@ -594,9 +594,9 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
 ; SSE-NEXT:    psadbw %xmm4, %xmm1
 ; SSE-NEXT:    paddq %xmm3, %xmm1
 ; SSE-NEXT:    psadbw %xmm4, %xmm2
-; SSE-NEXT:    paddq %xmm1, %xmm2
 ; SSE-NEXT:    psadbw %xmm4, %xmm0
 ; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    paddq %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    paddq %xmm0, %xmm1
 ; SSE-NEXT:    movd %xmm1, %eax
@@ -612,9 +612,9 @@ define i16 @test_v64i16_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpsadbw %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsadbw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax

diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index 4438da88073b8..6cc0e1e73fcdb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -82,11 +82,11 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE-LABEL: test_v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddq %xmm3, %xmm1
-; SSE-NEXT:    paddq %xmm2, %xmm1
-; SSE-NEXT:    paddq %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddq %xmm2, %xmm0
 ; SSE-NEXT:    paddq %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v8i64:
@@ -94,8 +94,8 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
@@ -132,30 +132,30 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE-LABEL: test_v16i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddq %xmm6, %xmm2
-; SSE-NEXT:    paddq %xmm7, %xmm3
-; SSE-NEXT:    paddq %xmm5, %xmm3
-; SSE-NEXT:    paddq %xmm1, %xmm3
-; SSE-NEXT:    paddq %xmm4, %xmm2
-; SSE-NEXT:    paddq %xmm3, %xmm2
-; SSE-NEXT:    paddq %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    paddq %xmm4, %xmm0
 ; SSE-NEXT:    paddq %xmm2, %xmm0
+; SSE-NEXT:    paddq %xmm7, %xmm3
+; SSE-NEXT:    paddq %xmm5, %xmm1
+; SSE-NEXT:    paddq %xmm3, %xmm1
+; SSE-NEXT:    paddq %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddq %xmm1, %xmm0
 ; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
@@ -165,7 +165,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-LABEL: test_v16i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
@@ -344,13 +344,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE-LABEL: test_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddd %xmm3, %xmm1
-; SSE-NEXT:    paddd %xmm2, %xmm1
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddd %xmm2, %xmm0
 ; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-SLOW-LABEL: test_v16i32:
@@ -358,8 +358,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -373,8 +373,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
@@ -416,14 +416,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE-LABEL: test_v32i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddd %xmm6, %xmm2
-; SSE-NEXT:    paddd %xmm7, %xmm3
-; SSE-NEXT:    paddd %xmm5, %xmm3
-; SSE-NEXT:    paddd %xmm1, %xmm3
-; SSE-NEXT:    paddd %xmm4, %xmm2
-; SSE-NEXT:    paddd %xmm3, %xmm2
-; SSE-NEXT:    paddd %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    paddd %xmm4, %xmm0
 ; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    paddd %xmm7, %xmm3
+; SSE-NEXT:    paddd %xmm5, %xmm1
+; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    paddd %xmm0, %xmm1
 ; SSE-NEXT:    movd %xmm1, %eax
@@ -432,16 +432,16 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-SLOW-LABEL: test_v32i32:
 ; AVX1-SLOW:       # %bb.0:
 ; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -453,16 +453,16 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-FAST-LABEL: test_v32i32:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm5
+; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
@@ -473,7 +473,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -746,16 +746,16 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-LABEL: test_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddw %xmm3, %xmm1
-; SSE-NEXT:    paddw %xmm2, %xmm1
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddw %xmm2, %xmm0
 ; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -764,8 +764,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -782,8 +782,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -834,14 +834,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-LABEL: test_v64i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddw %xmm6, %xmm2
-; SSE-NEXT:    paddw %xmm7, %xmm3
-; SSE-NEXT:    paddw %xmm5, %xmm3
-; SSE-NEXT:    paddw %xmm1, %xmm3
-; SSE-NEXT:    paddw %xmm4, %xmm2
-; SSE-NEXT:    paddw %xmm3, %xmm2
-; SSE-NEXT:    paddw %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    paddw %xmm4, %xmm0
 ; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    paddw %xmm7, %xmm3
+; SSE-NEXT:    paddw %xmm5, %xmm1
+; SSE-NEXT:    paddw %xmm3, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddw %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    paddw %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -854,16 +854,16 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-SLOW-LABEL: test_v64i16:
 ; AVX1-SLOW:       # %bb.0:
 ; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
+; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm0, %xmm5
+; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm5, %xmm4
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpaddw %xmm0, %xmm4, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -878,16 +878,16 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-FAST-LABEL: test_v64i16:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
+; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm0, %xmm5
+; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm5, %xmm4
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpaddw %xmm0, %xmm4, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -901,7 +901,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
@@ -1231,13 +1231,13 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE-LABEL: test_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddb %xmm3, %xmm1
-; SSE-NEXT:    paddb %xmm2, %xmm1
-; SSE-NEXT:    paddb %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    paddb %xmm2, %xmm0
 ; SSE-NEXT:    paddb %xmm1, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    psadbw %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    paddb %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    psadbw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -1246,8 +1246,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -1293,17 +1293,17 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE-LABEL: test_v128i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    paddb %xmm7, %xmm3
-; SSE-NEXT:    paddb %xmm5, %xmm3
-; SSE-NEXT:    paddb %xmm1, %xmm3
+; SSE-NEXT:    paddb %xmm5, %xmm1
+; SSE-NEXT:    paddb %xmm3, %xmm1
 ; SSE-NEXT:    paddb %xmm6, %xmm2
-; SSE-NEXT:    paddb %xmm4, %xmm2
-; SSE-NEXT:    paddb %xmm3, %xmm2
-; SSE-NEXT:    paddb %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    paddb %xmm4, %xmm0
 ; SSE-NEXT:    paddb %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    psadbw %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    paddb %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    paddb %xmm0, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm0
+; SSE-NEXT:    psadbw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -1313,13 +1313,13 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpaddb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -1332,7 +1332,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
index 6b77a18e9e0f7..f68cf441112fd 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll
@@ -747,10 +747,10 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; SSE-LABEL: trunc_v64i8_v64i1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    psllw $7, %xmm1
-; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -783,8 +783,8 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1284,10 +1284,10 @@ define i1 @icmp0_v8i64_v8i1(<8 x i64>) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -1295,9 +1295,9 @@ define i1 @icmp0_v8i64_v8i1(<8 x i64>) {
 ; SSE41-LABEL: icmp0_v8i64_v8i1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -1353,10 +1353,10 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -1364,9 +1364,9 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) {
 ; SSE41-LABEL: icmp0_v16i32_v16i1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -1403,10 +1403,10 @@ define i1 @icmp0_v32i16_v32i1(<32 x i16>) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -1414,9 +1414,9 @@ define i1 @icmp0_v32i16_v32i1(<32 x i16>) {
 ; SSE41-LABEL: icmp0_v32i16_v32i1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -1482,10 +1482,10 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; SSE2-LABEL: icmp0_v64i8_v64i1:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm3, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
@@ -1494,9 +1494,9 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; SSE41-LABEL: icmp0_v64i8_v64i1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -1525,8 +1525,8 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2199,10 +2199,10 @@ define i1 @icmp1_v8i64_v8i1(<8 x i64>) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -2211,10 +2211,10 @@ define i1 @icmp1_v8i64_v8i1(<8 x i64>) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    psubb %xmm4, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    psubb %xmm4, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -2280,10 +2280,10 @@ define i1 @icmp1_v16i32_v16i1(<16 x i32>) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -2292,10 +2292,10 @@ define i1 @icmp1_v16i32_v16i1(<16 x i32>) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    psubb %xmm4, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    psubb %xmm4, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -2340,10 +2340,10 @@ define i1 @icmp1_v32i16_v32i1(<32 x i16>) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm4, %xmm1
-; SSE2-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -2352,10 +2352,10 @@ define i1 @icmp1_v32i16_v32i1(<32 x i16>) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
 ; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    psubb %xmm4, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    psubb %xmm4, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -2430,10 +2430,10 @@ define i1 @icmp1_v64i8_v64i1(<64 x i8>) {
 ; SSE2-LABEL: icmp1_v64i8_v64i1:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpw $-1, %ax
 ; SSE2-NEXT:    sete %al
@@ -2442,11 +2442,11 @@ define i1 @icmp1_v64i8_v64i1(<64 x i8>) {
 ; SSE41-LABEL: icmp1_v64i8_v64i1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    psubb %xmm3, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    psubb %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -2482,8 +2482,8 @@ define i1 @icmp1_v64i8_v64i1(<64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -3126,9 +3126,9 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>, <8 x i64>) {
 ; SSE-NEXT:    pcmpeqb %xmm5, %xmm1
 ; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    pcmpeqb %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm1, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
 ; SSE-NEXT:    sete %al
@@ -3144,9 +3144,9 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>, <8 x i64>) {
 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
 ; AVX1-NEXT:    sete %al
@@ -3201,9 +3201,9 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) {
 ; SSE-NEXT:    pcmpeqb %xmm5, %xmm1
 ; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    pcmpeqb %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm1, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
 ; SSE-NEXT:    sete %al
@@ -3219,9 +3219,9 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) {
 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
 ; AVX1-NEXT:    sete %al
@@ -3257,9 +3257,9 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>, <32 x i16>) {
 ; SSE-NEXT:    pcmpeqb %xmm5, %xmm1
 ; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    pcmpeqb %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm1, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
 ; SSE-NEXT:    sete %al
@@ -3275,9 +3275,9 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>, <32 x i16>) {
 ; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
 ; AVX1-NEXT:    sete %al
@@ -3341,10 +3341,10 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpeqb %xmm6, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pcmpeqb %xmm7, %xmm3
 ; SSE-NEXT:    pcmpeqb %xmm5, %xmm1
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    cmpw $-1, %ax
@@ -3355,6 +3355,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
@@ -3363,7 +3364,6 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    cmpw $-1, %ax
 ; AVX1-NEXT:    sete %al
@@ -3389,7 +3389,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT:    vpand %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
index 124c7dffbdfc9..f68a292d02b59 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll
@@ -88,11 +88,11 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; SSE-LABEL: test_v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
 ; SSE-NEXT:    testq %rax, %rax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -145,14 +145,14 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; SSE-LABEL: test_v16i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    testq %rax, %rax
 ; SSE-NEXT:    setne %al
@@ -161,7 +161,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-LABEL: test_v16i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -176,7 +176,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-LABEL: test_v16i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -324,13 +324,13 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; SSE-LABEL: test_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    setne %al
 ; SSE-NEXT:    retq
@@ -389,14 +389,14 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; SSE-LABEL: test_v32i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movd %xmm1, %eax
@@ -407,7 +407,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -424,7 +424,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -619,16 +619,16 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; SSE-LABEL: test_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    sete %al
 ; SSE-NEXT:    retq
@@ -693,14 +693,14 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; SSE-LABEL: test_v64i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -714,7 +714,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -733,7 +733,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -980,19 +980,19 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; SSE-LABEL: test_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    setne %al
 ; SSE-NEXT:    retq
@@ -1063,14 +1063,14 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; SSE-LABEL: test_v128i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -1087,7 +1087,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-LABEL: test_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -1108,7 +1108,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll
index 20a2cdd5349b5..7ed5f11ce33fd 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll
@@ -74,11 +74,11 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE-LABEL: test_v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v8i64:
@@ -122,21 +122,21 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE-LABEL: test_v16i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -149,7 +149,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-LABEL: test_v16i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -273,13 +273,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE-LABEL: test_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -329,14 +329,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE-LABEL: test_v32i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movd %xmm1, %eax
@@ -345,7 +345,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -360,7 +360,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -536,16 +536,16 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-LABEL: test_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -605,14 +605,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-LABEL: test_v64i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -625,7 +625,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -643,7 +643,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -870,19 +870,19 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE-LABEL: test_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
 ; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -948,14 +948,14 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE-LABEL: test_v128i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pand %xmm4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pand %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -971,7 +971,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-LABEL: test_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
@@ -991,7 +991,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 403502d292d10..0103b7622dc3e 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -917,13 +917,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE41-LABEL: test_v16i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmulld %xmm3, %xmm1
-; SSE41-NEXT:    pmulld %xmm2, %xmm1
-; SSE41-NEXT:    pmulld %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pmulld %xmm2, %xmm0
 ; SSE41-NEXT:    pmulld %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    pmulld %xmm0, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    movd %xmm0, %eax
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -931,8 +931,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1007,14 +1007,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE41-LABEL: test_v32i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pmulld %xmm6, %xmm2
-; SSE41-NEXT:    pmulld %xmm7, %xmm3
-; SSE41-NEXT:    pmulld %xmm5, %xmm3
-; SSE41-NEXT:    pmulld %xmm1, %xmm3
-; SSE41-NEXT:    pmulld %xmm4, %xmm2
-; SSE41-NEXT:    pmulld %xmm3, %xmm2
-; SSE41-NEXT:    pmulld %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE41-NEXT:    pmulld %xmm4, %xmm0
 ; SSE41-NEXT:    pmulld %xmm2, %xmm0
+; SSE41-NEXT:    pmulld %xmm7, %xmm3
+; SSE41-NEXT:    pmulld %xmm5, %xmm1
+; SSE41-NEXT:    pmulld %xmm3, %xmm1
+; SSE41-NEXT:    pmulld %xmm0, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE41-NEXT:    pmulld %xmm0, %xmm1
 ; SSE41-NEXT:    movd %xmm1, %eax
@@ -1023,16 +1023,16 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpmulld %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1044,7 +1044,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
@@ -1220,16 +1220,16 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-LABEL: test_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pmullw %xmm3, %xmm1
-; SSE-NEXT:    pmullw %xmm2, %xmm1
-; SSE-NEXT:    pmullw %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pmullw %xmm2, %xmm0
 ; SSE-NEXT:    pmullw %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pmullw %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    pmullw %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -1238,8 +1238,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1342,14 +1342,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-LABEL: test_v64i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pmullw %xmm6, %xmm2
-; SSE-NEXT:    pmullw %xmm7, %xmm3
-; SSE-NEXT:    pmullw %xmm5, %xmm3
-; SSE-NEXT:    pmullw %xmm1, %xmm3
-; SSE-NEXT:    pmullw %xmm4, %xmm2
-; SSE-NEXT:    pmullw %xmm3, %xmm2
-; SSE-NEXT:    pmullw %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pmullw %xmm4, %xmm0
 ; SSE-NEXT:    pmullw %xmm2, %xmm0
+; SSE-NEXT:    pmullw %xmm7, %xmm3
+; SSE-NEXT:    pmullw %xmm5, %xmm1
+; SSE-NEXT:    pmullw %xmm3, %xmm1
+; SSE-NEXT:    pmullw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pmullw %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pmullw %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -1362,16 +1362,16 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1386,7 +1386,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -1442,8 +1442,8 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1462,8 +1462,8 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
 ; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1650,9 +1650,9 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; SSE2-NEXT:    pmullw %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pmullw %xmm1, %xmm2
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -1671,8 +1671,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; SSE41-NEXT:    pmullw %xmm2, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pmullw %xmm1, %xmm0
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
+; SSE41-NEXT:    pmullw %xmm1, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -1691,8 +1691,8 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -1753,19 +1753,19 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE2-NEXT:    pmullw %xmm4, %xmm3
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pmullw %xmm3, %xmm4
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm4, %xmm1
+; SSE2-NEXT:    pmullw %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pmullw %xmm2, %xmm3
-; SSE2-NEXT:    pmullw %xmm1, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm0
+; SSE2-NEXT:    pmullw %xmm2, %xmm0
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -1784,16 +1784,16 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE41-NEXT:    pmullw %xmm4, %xmm3
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pmullw %xmm3, %xmm1
 ; SSE41-NEXT:    pmullw %xmm4, %xmm1
+; SSE41-NEXT:    pmullw %xmm3, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm3, %xmm2
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm3, %xmm0
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
 ; SSE41-NEXT:    pmullw %xmm1, %xmm0
-; SSE41-NEXT:    pmullw %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -1813,17 +1813,17 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1841,8 +1841,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -1904,8 +1904,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -1927,8 +1927,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -1955,39 +1955,39 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE2-NEXT:    pmullw %xmm8, %xmm7
 ; SSE2-NEXT:    movdqa %xmm3, %xmm8
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pmullw %xmm7, %xmm8
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm8, %xmm3
+; SSE2-NEXT:    pmullw %xmm7, %xmm3
 ; SSE2-NEXT:    movdqa %xmm5, %xmm7
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm7, %xmm5
 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pmullw %xmm5, %xmm7
-; SSE2-NEXT:    pmullw %xmm3, %xmm7
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm7, %xmm1
+; SSE2-NEXT:    pmullw %xmm5, %xmm1
+; SSE2-NEXT:    pmullw %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm6, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm6
 ; SSE2-NEXT:    movdqa %xmm2, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pmullw %xmm6, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm2
+; SSE2-NEXT:    pmullw %xmm6, %xmm2
 ; SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm4
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    pmullw %xmm4, %xmm3
-; SSE2-NEXT:    pmullw %xmm2, %xmm3
-; SSE2-NEXT:    pmullw %xmm1, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pmullw %xmm3, %xmm0
+; SSE2-NEXT:    pmullw %xmm4, %xmm0
+; SSE2-NEXT:    pmullw %xmm2, %xmm0
+; SSE2-NEXT:    pmullw %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -2006,32 +2006,32 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE41-NEXT:    pmullw %xmm8, %xmm7
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pmullw %xmm7, %xmm3
 ; SSE41-NEXT:    pmullw %xmm8, %xmm3
+; SSE41-NEXT:    pmullw %xmm7, %xmm3
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm7, %xmm5
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm7, %xmm1
 ; SSE41-NEXT:    pmullw %xmm5, %xmm1
 ; SSE41-NEXT:    pmullw %xmm3, %xmm1
-; SSE41-NEXT:    pmullw %xmm7, %xmm1
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm3, %xmm6
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT:    pmullw %xmm6, %xmm2
 ; SSE41-NEXT:    pmullw %xmm3, %xmm2
+; SSE41-NEXT:    pmullw %xmm6, %xmm2
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE41-NEXT:    pmullw %xmm3, %xmm4
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pmullw %xmm3, %xmm0
 ; SSE41-NEXT:    pmullw %xmm4, %xmm0
 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
 ; SSE41-NEXT:    pmullw %xmm1, %xmm0
-; SSE41-NEXT:    pmullw %xmm3, %xmm0
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -2051,8 +2051,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -2060,26 +2060,26 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpmullw %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
 ; AVX1-NEXT:    vpmullw %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm3, %xmm5, %xmm3
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpmullw %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -2097,16 +2097,16 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX2-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
@@ -2127,8 +2127,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BW-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
@@ -2151,8 +2151,8 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm1, %zmm1
 ; AVX512BWVL-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm2, %zmm1
 ; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
+; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
@@ -2177,17 +2177,17 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
 ; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -2209,17 +2209,17 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
 ; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
 ; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm4, %ymm2
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
 ; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
 ; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm3, %ymm1
-; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
 ; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]

diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index 2eb52a781b8be..788874b559b6f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -741,10 +741,10 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; SSE-LABEL: trunc_v64i8_v64i1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    psllw $7, %xmm1
-; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    testl %eax, %eax
 ; SSE-NEXT:    setne %al
 ; SSE-NEXT:    retq
@@ -777,8 +777,8 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1582,10 +1582,10 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; SSE-NEXT:    pxor %xmm4, %xmm4
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm3
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm1
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
 ; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    testl %eax, %eax
@@ -1597,13 +1597,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
 ; AVX1-NEXT:    setne %al
@@ -1631,8 +1631,8 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpor %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2418,10 +2418,10 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpeqb %xmm6, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    pcmpeqb %xmm7, %xmm3
 ; SSE-NEXT:    pcmpeqb %xmm5, %xmm1
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
 ; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    testl %eax, %eax
@@ -2432,6 +2432,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
@@ -2440,7 +2441,6 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    testl %eax, %eax
 ; AVX1-NEXT:    setne %al
@@ -2467,7 +2467,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpor %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index f048911feeb52..fcb0ab6090398 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -69,11 +69,11 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; SSE2-LABEL: test_v8i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -81,9 +81,9 @@ define i1 @test_v8i64(<8 x i64> %a0) {
 ; SSE41-LABEL: test_v8i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -120,15 +120,15 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; SSE2-LABEL: test_v16i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm7, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
@@ -136,20 +136,20 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; SSE41-LABEL: test_v16i64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm7, %xmm3
-; SSE41-NEXT:    por %xmm5, %xmm3
-; SSE41-NEXT:    por %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    por %xmm3, %xmm1
 ; SSE41-NEXT:    por %xmm6, %xmm2
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    ptest %xmm2, %xmm2
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vptest %ymm0, %ymm0
 ; AVX1-NEXT:    setne %al
@@ -159,7 +159,7 @@ define i1 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-LABEL: test_v16i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vptest %ymm0, %ymm0
 ; AVX2-NEXT:    setne %al
@@ -262,11 +262,11 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; SSE2-LABEL: test_v16i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
@@ -274,9 +274,9 @@ define i1 @test_v16i32(<16 x i32> %a0) {
 ; SSE41-LABEL: test_v16i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    retq
 ;
@@ -313,15 +313,15 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; SSE2-LABEL: test_v32i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm7, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -329,20 +329,20 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; SSE41-LABEL: test_v32i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm7, %xmm3
-; SSE41-NEXT:    por %xmm5, %xmm3
-; SSE41-NEXT:    por %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    por %xmm3, %xmm1
 ; SSE41-NEXT:    por %xmm6, %xmm2
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    ptest %xmm2, %xmm2
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vptest %ymm0, %ymm0
 ; AVX1-NEXT:    sete %al
@@ -352,7 +352,7 @@ define i1 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vptest %ymm0, %ymm0
 ; AVX2-NEXT:    sete %al
@@ -474,11 +474,11 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; SSE2-LABEL: test_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -486,9 +486,9 @@ define i1 @test_v32i16(<32 x i16> %a0) {
 ; SSE41-LABEL: test_v32i16:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
@@ -525,15 +525,15 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; SSE2-LABEL: test_v64i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm7, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
@@ -541,20 +541,20 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; SSE41-LABEL: test_v64i16:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm7, %xmm3
-; SSE41-NEXT:    por %xmm5, %xmm3
-; SSE41-NEXT:    por %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    por %xmm3, %xmm1
 ; SSE41-NEXT:    por %xmm6, %xmm2
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    ptest %xmm2, %xmm2
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vptest %ymm0, %ymm0
 ; AVX1-NEXT:    setne %al
@@ -564,7 +564,7 @@ define i1 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vptest %ymm0, %ymm0
 ; AVX2-NEXT:    setne %al
@@ -705,11 +705,11 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; SSE2-LABEL: test_v64i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
@@ -717,9 +717,9 @@ define i1 @test_v64i8(<64 x i8> %a0) {
 ; SSE41-LABEL: test_v64i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm3, %xmm1
-; SSE41-NEXT:    por %xmm2, %xmm1
-; SSE41-NEXT:    por %xmm0, %xmm1
-; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    retq
 ;
@@ -756,15 +756,15 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; SSE2-LABEL: test_v128i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm7, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pcmpeqb %xmm2, %xmm0
-; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
@@ -772,20 +772,20 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; SSE41-LABEL: test_v128i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm7, %xmm3
-; SSE41-NEXT:    por %xmm5, %xmm3
-; SSE41-NEXT:    por %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    por %xmm3, %xmm1
 ; SSE41-NEXT:    por %xmm6, %xmm2
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    ptest %xmm2, %xmm2
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vptest %ymm0, %ymm0
 ; AVX1-NEXT:    sete %al
@@ -795,7 +795,7 @@ define i1 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vptest %ymm0, %ymm0
 ; AVX2-NEXT:    sete %al
@@ -964,14 +964,14 @@ define i1 @mask_v128i8(<128 x i8> %a0) {
 ; SSE2-LABEL: mask_v128i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    por %xmm7, %xmm3
-; SSE2-NEXT:    por %xmm5, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm1
 ; SSE2-NEXT:    por %xmm6, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    por %xmm0, %xmm2
-; SSE2-NEXT:    psllw $7, %xmm2
-; SSE2-NEXT:    pmovmskb %xmm2, %eax
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    psllw $7, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
 ; SSE2-NEXT:    xorl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; SSE2-NEXT:    sete %al
@@ -980,20 +980,20 @@ define i1 @mask_v128i8(<128 x i8> %a0) {
 ; SSE41-LABEL: mask_v128i8:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    por %xmm7, %xmm3
-; SSE41-NEXT:    por %xmm5, %xmm3
-; SSE41-NEXT:    por %xmm1, %xmm3
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    por %xmm3, %xmm1
 ; SSE41-NEXT:    por %xmm6, %xmm2
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    por %xmm3, %xmm2
-; SSE41-NEXT:    por %xmm0, %xmm2
-; SSE41-NEXT:    ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE41-NEXT:    por %xmm4, %xmm0
+; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    sete %al
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: mask_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
 ; AVX1-NEXT:    sete %al
@@ -1003,7 +1003,7 @@ define i1 @mask_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: mask_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673]
 ; AVX2-NEXT:    vptest %ymm1, %ymm0
@@ -1091,11 +1091,11 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    orl %eax, %ecx
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT:    movd %xmm0, %edx
-; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    testb $1, %dl
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    orl %ecx, %eax
+; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je .LBB27_2
 ; SSE2-NEXT:  # %bb.1:
 ; SSE2-NEXT:    xorl %eax, %eax
@@ -1111,10 +1111,10 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE41-NEXT:    pxor %xmm0, %xmm1
 ; SSE41-NEXT:    pextrd $1, %xmm1, %eax
 ; SSE41-NEXT:    movd %xmm1, %ecx
-; SSE41-NEXT:    pextrd $2, %xmm1, %edx
-; SSE41-NEXT:    orl %eax, %edx
-; SSE41-NEXT:    orl %ecx, %edx
-; SSE41-NEXT:    testb $1, %dl
+; SSE41-NEXT:    orl %eax, %ecx
+; SSE41-NEXT:    pextrd $2, %xmm1, %eax
+; SSE41-NEXT:    orl %ecx, %eax
+; SSE41-NEXT:    testb $1, %al
 ; SSE41-NEXT:    je .LBB27_2
 ; SSE41-NEXT:  # %bb.1:
 ; SSE41-NEXT:    xorl %eax, %eax
@@ -1130,10 +1130,10 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX1-NEXT:    vmovd %xmm0, %ecx
-; AVX1-NEXT:    vpextrd $2, %xmm0, %edx
-; AVX1-NEXT:    orl %eax, %edx
-; AVX1-NEXT:    orl %ecx, %edx
-; AVX1-NEXT:    testb $1, %dl
+; AVX1-NEXT:    orl %eax, %ecx
+; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX1-NEXT:    orl %ecx, %eax
+; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    je .LBB27_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    xorl %eax, %eax
@@ -1149,10 +1149,10 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX2-NEXT:    vmovd %xmm0, %ecx
-; AVX2-NEXT:    vpextrd $2, %xmm0, %edx
-; AVX2-NEXT:    orl %eax, %edx
-; AVX2-NEXT:    orl %ecx, %edx
-; AVX2-NEXT:    testb $1, %dl
+; AVX2-NEXT:    orl %eax, %ecx
+; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX2-NEXT:    orl %ecx, %eax
+; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    je .LBB27_2
 ; AVX2-NEXT:  # %bb.1:
 ; AVX2-NEXT:    xorl %eax, %eax

diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll
index 17ffe81fb4860..28309f619b5aa 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll
@@ -74,11 +74,11 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE-LABEL: test_v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v8i64:
@@ -122,21 +122,21 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE-LABEL: test_v16i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm6, %xmm2
-; SSE-NEXT:    por %xmm7, %xmm3
-; SSE-NEXT:    por %xmm5, %xmm3
-; SSE-NEXT:    por %xmm1, %xmm3
-; SSE-NEXT:    por %xmm4, %xmm2
-; SSE-NEXT:    por %xmm3, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
@@ -149,7 +149,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-LABEL: test_v16i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
@@ -273,13 +273,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE-LABEL: test_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -329,14 +329,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE-LABEL: test_v32i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm6, %xmm2
-; SSE-NEXT:    por %xmm7, %xmm3
-; SSE-NEXT:    por %xmm5, %xmm3
-; SSE-NEXT:    por %xmm1, %xmm3
-; SSE-NEXT:    por %xmm4, %xmm2
-; SSE-NEXT:    por %xmm3, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    por %xmm0, %xmm1
 ; SSE-NEXT:    movd %xmm1, %eax
@@ -345,7 +345,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
@@ -360,7 +360,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
@@ -536,16 +536,16 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-LABEL: test_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -605,14 +605,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-LABEL: test_v64i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm6, %xmm2
-; SSE-NEXT:    por %xmm7, %xmm3
-; SSE-NEXT:    por %xmm5, %xmm3
-; SSE-NEXT:    por %xmm1, %xmm3
-; SSE-NEXT:    por %xmm4, %xmm2
-; SSE-NEXT:    por %xmm3, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    por %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -625,7 +625,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
@@ -643,7 +643,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
@@ -870,19 +870,19 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE-LABEL: test_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm3, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
 ; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -948,14 +948,14 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE-LABEL: test_v128i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    por %xmm6, %xmm2
-; SSE-NEXT:    por %xmm7, %xmm3
-; SSE-NEXT:    por %xmm5, %xmm3
-; SSE-NEXT:    por %xmm1, %xmm3
-; SSE-NEXT:    por %xmm4, %xmm2
-; SSE-NEXT:    por %xmm3, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    por %xmm4, %xmm0
 ; SSE-NEXT:    por %xmm2, %xmm0
+; SSE-NEXT:    por %xmm7, %xmm3
+; SSE-NEXT:    por %xmm5, %xmm1
+; SSE-NEXT:    por %xmm3, %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    por %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    por %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -971,7 +971,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-LABEL: test_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
@@ -991,7 +991,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
index 839ca7975ff83..21e6f3368c90f 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
@@ -931,13 +931,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE4-LABEL: test_v16i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxsd %xmm3, %xmm1
-; SSE4-NEXT:    pmaxsd %xmm2, %xmm1
-; SSE4-NEXT:    pmaxsd %xmm0, %xmm1
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmaxsd %xmm2, %xmm0
 ; SSE4-NEXT:    pmaxsd %xmm1, %xmm0
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE4-NEXT:    pmaxsd %xmm0, %xmm1
-; SSE4-NEXT:    movd %xmm1, %eax
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -945,8 +945,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxsd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1041,14 +1041,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE4-LABEL: test_v32i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxsd %xmm6, %xmm2
-; SSE4-NEXT:    pmaxsd %xmm7, %xmm3
-; SSE4-NEXT:    pmaxsd %xmm5, %xmm3
-; SSE4-NEXT:    pmaxsd %xmm1, %xmm3
-; SSE4-NEXT:    pmaxsd %xmm4, %xmm2
-; SSE4-NEXT:    pmaxsd %xmm3, %xmm2
-; SSE4-NEXT:    pmaxsd %xmm0, %xmm2
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmaxsd %xmm4, %xmm0
 ; SSE4-NEXT:    pmaxsd %xmm2, %xmm0
+; SSE4-NEXT:    pmaxsd %xmm7, %xmm3
+; SSE4-NEXT:    pmaxsd %xmm5, %xmm1
+; SSE4-NEXT:    pmaxsd %xmm3, %xmm1
+; SSE4-NEXT:    pmaxsd %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmaxsd %xmm1, %xmm0
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE4-NEXT:    pmaxsd %xmm0, %xmm1
 ; SSE4-NEXT:    movd %xmm1, %eax
@@ -1057,16 +1057,16 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmaxsd %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpmaxsd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmaxsd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmaxsd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpmaxsd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsd %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1078,7 +1078,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmaxsd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmaxsd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmaxsd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
@@ -1288,26 +1288,26 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE2-LABEL: test_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm2, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v32i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxsw %xmm3, %xmm1
-; SSE4-NEXT:    pmaxsw %xmm2, %xmm1
-; SSE4-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT:    phminposuw %xmm1, %xmm0
+; SSE4-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE4-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    xorl $32767, %eax # imm = 0x7FFF
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1318,8 +1318,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxsw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1362,14 +1362,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE2-LABEL: test_v64i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pmaxsw %xmm6, %xmm2
-; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
-; SSE2-NEXT:    pmaxsw %xmm5, %xmm3
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm3
-; SSE2-NEXT:    pmaxsw %xmm4, %xmm2
-; SSE2-NEXT:    pmaxsw %xmm3, %xmm2
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pmaxsw %xmm4, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
+; SSE2-NEXT:    pmaxsw %xmm5, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
@@ -1382,14 +1382,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE4-LABEL: test_v64i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxsw %xmm7, %xmm3
-; SSE4-NEXT:    pmaxsw %xmm5, %xmm3
-; SSE4-NEXT:    pmaxsw %xmm1, %xmm3
+; SSE4-NEXT:    pmaxsw %xmm5, %xmm1
+; SSE4-NEXT:    pmaxsw %xmm3, %xmm1
 ; SSE4-NEXT:    pmaxsw %xmm6, %xmm2
-; SSE4-NEXT:    pmaxsw %xmm4, %xmm2
-; SSE4-NEXT:    pmaxsw %xmm3, %xmm2
-; SSE4-NEXT:    pmaxsw %xmm0, %xmm2
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT:    phminposuw %xmm2, %xmm0
+; SSE4-NEXT:    pmaxsw %xmm4, %xmm0
+; SSE4-NEXT:    pmaxsw %xmm2, %xmm0
+; SSE4-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    xorl $32767, %eax # imm = 0x7FFF
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1401,13 +1401,13 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpmaxsw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpmaxsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxsw %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpmaxsw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaxsw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxsw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpmaxsw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1419,7 +1419,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmaxsw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmaxsw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmaxsw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
@@ -1836,13 +1836,13 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE4-LABEL: test_v64i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxsb %xmm3, %xmm1
-; SSE4-NEXT:    pmaxsb %xmm2, %xmm1
-; SSE4-NEXT:    pmaxsb %xmm0, %xmm1
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT:    movdqa %xmm1, %xmm0
-; SSE4-NEXT:    psrlw $8, %xmm0
-; SSE4-NEXT:    pminub %xmm1, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    pmaxsb %xmm2, %xmm0
+; SSE4-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrlw $8, %xmm1
+; SSE4-NEXT:    pminub %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    xorb $127, %al
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
@@ -1853,8 +1853,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxsb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
@@ -1970,17 +1970,17 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE4-LABEL: test_v128i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxsb %xmm7, %xmm3
-; SSE4-NEXT:    pmaxsb %xmm5, %xmm3
-; SSE4-NEXT:    pmaxsb %xmm1, %xmm3
+; SSE4-NEXT:    pmaxsb %xmm5, %xmm1
+; SSE4-NEXT:    pmaxsb %xmm3, %xmm1
 ; SSE4-NEXT:    pmaxsb %xmm6, %xmm2
-; SSE4-NEXT:    pmaxsb %xmm4, %xmm2
-; SSE4-NEXT:    pmaxsb %xmm3, %xmm2
-; SSE4-NEXT:    pmaxsb %xmm0, %xmm2
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    psrlw $8, %xmm0
-; SSE4-NEXT:    pminub %xmm2, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    pmaxsb %xmm4, %xmm0
+; SSE4-NEXT:    pmaxsb %xmm2, %xmm0
+; SSE4-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrlw $8, %xmm1
+; SSE4-NEXT:    pminub %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    xorb $127, %al
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
@@ -1992,13 +1992,13 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpmaxsb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpmaxsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxsb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpmaxsb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaxsb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxsb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpmaxsb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxsb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
@@ -2012,7 +2012,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmaxsb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmaxsb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmaxsb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
index a0eb00325ebd7..967cb00636d52 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
@@ -931,13 +931,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE4-LABEL: test_v16i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminsd %xmm3, %xmm1
-; SSE4-NEXT:    pminsd %xmm2, %xmm1
-; SSE4-NEXT:    pminsd %xmm0, %xmm1
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pminsd %xmm2, %xmm0
 ; SSE4-NEXT:    pminsd %xmm1, %xmm0
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE4-NEXT:    pminsd %xmm0, %xmm1
-; SSE4-NEXT:    movd %xmm1, %eax
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT:    pminsd %xmm1, %xmm0
+; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -945,8 +945,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminsd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1041,14 +1041,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE4-LABEL: test_v32i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminsd %xmm6, %xmm2
-; SSE4-NEXT:    pminsd %xmm7, %xmm3
-; SSE4-NEXT:    pminsd %xmm5, %xmm3
-; SSE4-NEXT:    pminsd %xmm1, %xmm3
-; SSE4-NEXT:    pminsd %xmm4, %xmm2
-; SSE4-NEXT:    pminsd %xmm3, %xmm2
-; SSE4-NEXT:    pminsd %xmm0, %xmm2
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pminsd %xmm4, %xmm0
 ; SSE4-NEXT:    pminsd %xmm2, %xmm0
+; SSE4-NEXT:    pminsd %xmm7, %xmm3
+; SSE4-NEXT:    pminsd %xmm5, %xmm1
+; SSE4-NEXT:    pminsd %xmm3, %xmm1
+; SSE4-NEXT:    pminsd %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pminsd %xmm1, %xmm0
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE4-NEXT:    pminsd %xmm0, %xmm1
 ; SSE4-NEXT:    movd %xmm1, %eax
@@ -1057,16 +1057,16 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpminsd %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpminsd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpminsd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpminsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpminsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpminsd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpminsd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsd %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1078,7 +1078,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpminsd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpminsd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpminsd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
@@ -1288,26 +1288,26 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE2-LABEL: test_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pminsw %xmm3, %xmm1
-; SSE2-NEXT:    pminsw %xmm2, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; SSE2-NEXT:    pminsw %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE2-NEXT:    pminsw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v32i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminsw %xmm3, %xmm1
-; SSE4-NEXT:    pminsw %xmm2, %xmm1
-; SSE4-NEXT:    pminsw %xmm0, %xmm1
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT:    phminposuw %xmm1, %xmm0
+; SSE4-NEXT:    pminsw %xmm2, %xmm0
+; SSE4-NEXT:    pminsw %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    xorl $32768, %eax # imm = 0x8000
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1318,8 +1318,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminsw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1362,14 +1362,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE2-LABEL: test_v64i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pminsw %xmm6, %xmm2
-; SSE2-NEXT:    pminsw %xmm7, %xmm3
-; SSE2-NEXT:    pminsw %xmm5, %xmm3
-; SSE2-NEXT:    pminsw %xmm1, %xmm3
-; SSE2-NEXT:    pminsw %xmm4, %xmm2
-; SSE2-NEXT:    pminsw %xmm3, %xmm2
-; SSE2-NEXT:    pminsw %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pminsw %xmm4, %xmm0
 ; SSE2-NEXT:    pminsw %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm7, %xmm3
+; SSE2-NEXT:    pminsw %xmm5, %xmm1
+; SSE2-NEXT:    pminsw %xmm3, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
@@ -1382,14 +1382,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE4-LABEL: test_v64i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminsw %xmm7, %xmm3
-; SSE4-NEXT:    pminsw %xmm5, %xmm3
-; SSE4-NEXT:    pminsw %xmm1, %xmm3
+; SSE4-NEXT:    pminsw %xmm5, %xmm1
+; SSE4-NEXT:    pminsw %xmm3, %xmm1
 ; SSE4-NEXT:    pminsw %xmm6, %xmm2
-; SSE4-NEXT:    pminsw %xmm4, %xmm2
-; SSE4-NEXT:    pminsw %xmm3, %xmm2
-; SSE4-NEXT:    pminsw %xmm0, %xmm2
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT:    phminposuw %xmm2, %xmm0
+; SSE4-NEXT:    pminsw %xmm4, %xmm0
+; SSE4-NEXT:    pminsw %xmm2, %xmm0
+; SSE4-NEXT:    pminsw %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    xorl $32768, %eax # imm = 0x8000
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1401,13 +1401,13 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpminsw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpminsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminsw %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpminsw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpminsw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpminsw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpminsw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
@@ -1419,7 +1419,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpminsw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpminsw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpminsw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
@@ -1836,13 +1836,13 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE4-LABEL: test_v64i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminsb %xmm3, %xmm1
-; SSE4-NEXT:    pminsb %xmm2, %xmm1
-; SSE4-NEXT:    pminsb %xmm0, %xmm1
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE4-NEXT:    movdqa %xmm1, %xmm0
-; SSE4-NEXT:    psrlw $8, %xmm0
-; SSE4-NEXT:    pminub %xmm1, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    pminsb %xmm2, %xmm0
+; SSE4-NEXT:    pminsb %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrlw $8, %xmm1
+; SSE4-NEXT:    pminub %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    addb $-128, %al
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
@@ -1853,8 +1853,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminsb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
@@ -1970,17 +1970,17 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE4-LABEL: test_v128i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminsb %xmm7, %xmm3
-; SSE4-NEXT:    pminsb %xmm5, %xmm3
-; SSE4-NEXT:    pminsb %xmm1, %xmm3
+; SSE4-NEXT:    pminsb %xmm5, %xmm1
+; SSE4-NEXT:    pminsb %xmm3, %xmm1
 ; SSE4-NEXT:    pminsb %xmm6, %xmm2
-; SSE4-NEXT:    pminsb %xmm4, %xmm2
-; SSE4-NEXT:    pminsb %xmm3, %xmm2
-; SSE4-NEXT:    pminsb %xmm0, %xmm2
-; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    psrlw $8, %xmm0
-; SSE4-NEXT:    pminub %xmm2, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    pminsb %xmm4, %xmm0
+; SSE4-NEXT:    pminsb %xmm2, %xmm0
+; SSE4-NEXT:    pminsb %xmm1, %xmm0
+; SSE4-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrlw $8, %xmm1
+; SSE4-NEXT:    pminub %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    addb $-128, %al
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
@@ -1992,13 +1992,13 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpminsb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpminsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminsb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpminsb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpminsb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpminsb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpminsb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminsb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
@@ -2012,7 +2012,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpminsb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpminsb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpminsb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index cb3634508e4ce..79f71276a1897 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -1065,13 +1065,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE4-LABEL: test_v16i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxud %xmm3, %xmm1
-; SSE4-NEXT:    pmaxud %xmm2, %xmm1
-; SSE4-NEXT:    pmaxud %xmm0, %xmm1
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmaxud %xmm2, %xmm0
 ; SSE4-NEXT:    pmaxud %xmm1, %xmm0
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE4-NEXT:    pmaxud %xmm0, %xmm1
-; SSE4-NEXT:    movd %xmm1, %eax
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT:    pmaxud %xmm1, %xmm0
+; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -1079,8 +1079,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxud %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1202,14 +1202,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE4-LABEL: test_v32i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxud %xmm6, %xmm2
-; SSE4-NEXT:    pmaxud %xmm7, %xmm3
-; SSE4-NEXT:    pmaxud %xmm5, %xmm3
-; SSE4-NEXT:    pmaxud %xmm1, %xmm3
-; SSE4-NEXT:    pmaxud %xmm4, %xmm2
-; SSE4-NEXT:    pmaxud %xmm3, %xmm2
-; SSE4-NEXT:    pmaxud %xmm0, %xmm2
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pmaxud %xmm4, %xmm0
 ; SSE4-NEXT:    pmaxud %xmm2, %xmm0
+; SSE4-NEXT:    pmaxud %xmm7, %xmm3
+; SSE4-NEXT:    pmaxud %xmm5, %xmm1
+; SSE4-NEXT:    pmaxud %xmm3, %xmm1
+; SSE4-NEXT:    pmaxud %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pmaxud %xmm1, %xmm0
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE4-NEXT:    pmaxud %xmm0, %xmm1
 ; SSE4-NEXT:    movd %xmm1, %eax
@@ -1218,16 +1218,16 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpmaxud %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmaxud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpmaxud %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpmaxud %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1239,7 +1239,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmaxud %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmaxud %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmaxud %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
@@ -1529,11 +1529,11 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE4-LABEL: test_v32i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxuw %xmm3, %xmm1
-; SSE4-NEXT:    pmaxuw %xmm2, %xmm1
-; SSE4-NEXT:    pmaxuw %xmm0, %xmm1
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT:    pxor %xmm1, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE4-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    pxor %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    notl %eax
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1544,8 +1544,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
@@ -1634,15 +1634,15 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE4-LABEL: test_v64i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxuw %xmm7, %xmm3
-; SSE4-NEXT:    pmaxuw %xmm5, %xmm3
-; SSE4-NEXT:    pmaxuw %xmm1, %xmm3
+; SSE4-NEXT:    pmaxuw %xmm5, %xmm1
+; SSE4-NEXT:    pmaxuw %xmm3, %xmm1
 ; SSE4-NEXT:    pmaxuw %xmm6, %xmm2
-; SSE4-NEXT:    pmaxuw %xmm4, %xmm2
-; SSE4-NEXT:    pmaxuw %xmm3, %xmm2
-; SSE4-NEXT:    pmaxuw %xmm0, %xmm2
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT:    pxor %xmm2, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    pmaxuw %xmm4, %xmm0
+; SSE4-NEXT:    pmaxuw %xmm2, %xmm0
+; SSE4-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    pxor %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    notl %eax
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1654,13 +1654,13 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpmaxuw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpmaxuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxuw %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpmaxuw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaxuw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxuw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpmaxuw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxuw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
@@ -1673,7 +1673,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmaxuw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmaxuw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmaxuw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
@@ -1996,33 +1996,33 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE2-LABEL: test_v64i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pmaxub %xmm3, %xmm1
-; SSE2-NEXT:    pmaxub %xmm2, %xmm1
-; SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pmaxub %xmm2, %xmm0
 ; SSE2-NEXT:    pmaxub %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE2-NEXT:    pmaxub %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    pmaxub %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v64i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxub %xmm3, %xmm1
-; SSE4-NEXT:    pmaxub %xmm2, %xmm1
-; SSE4-NEXT:    pmaxub %xmm0, %xmm1
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT:    pxor %xmm1, %xmm0
-; SSE4-NEXT:    movdqa %xmm0, %xmm1
-; SSE4-NEXT:    psrlw $8, %xmm1
-; SSE4-NEXT:    pminub %xmm0, %xmm1
-; SSE4-NEXT:    phminposuw %xmm1, %xmm0
+; SSE4-NEXT:    pmaxub %xmm2, %xmm0
+; SSE4-NEXT:    pmaxub %xmm1, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    pxor %xmm0, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    psrlw $8, %xmm0
+; SSE4-NEXT:    pminub %xmm1, %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
@@ -2033,8 +2033,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmaxub %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
@@ -2101,14 +2101,14 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE2-LABEL: test_v128i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pmaxub %xmm6, %xmm2
-; SSE2-NEXT:    pmaxub %xmm7, %xmm3
-; SSE2-NEXT:    pmaxub %xmm5, %xmm3
-; SSE2-NEXT:    pmaxub %xmm1, %xmm3
-; SSE2-NEXT:    pmaxub %xmm4, %xmm2
-; SSE2-NEXT:    pmaxub %xmm3, %xmm2
-; SSE2-NEXT:    pmaxub %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pmaxub %xmm4, %xmm0
 ; SSE2-NEXT:    pmaxub %xmm2, %xmm0
+; SSE2-NEXT:    pmaxub %xmm7, %xmm3
+; SSE2-NEXT:    pmaxub %xmm5, %xmm1
+; SSE2-NEXT:    pmaxub %xmm3, %xmm1
+; SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pmaxub %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    pmaxub %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
@@ -2124,18 +2124,18 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE4-LABEL: test_v128i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pmaxub %xmm7, %xmm3
-; SSE4-NEXT:    pmaxub %xmm5, %xmm3
-; SSE4-NEXT:    pmaxub %xmm1, %xmm3
+; SSE4-NEXT:    pmaxub %xmm5, %xmm1
+; SSE4-NEXT:    pmaxub %xmm3, %xmm1
 ; SSE4-NEXT:    pmaxub %xmm6, %xmm2
-; SSE4-NEXT:    pmaxub %xmm4, %xmm2
-; SSE4-NEXT:    pmaxub %xmm3, %xmm2
-; SSE4-NEXT:    pmaxub %xmm0, %xmm2
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT:    pxor %xmm2, %xmm0
-; SSE4-NEXT:    movdqa %xmm0, %xmm1
-; SSE4-NEXT:    psrlw $8, %xmm1
-; SSE4-NEXT:    pminub %xmm0, %xmm1
-; SSE4-NEXT:    phminposuw %xmm1, %xmm0
+; SSE4-NEXT:    pmaxub %xmm4, %xmm0
+; SSE4-NEXT:    pmaxub %xmm2, %xmm0
+; SSE4-NEXT:    pmaxub %xmm1, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE4-NEXT:    pxor %xmm0, %xmm1
+; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    psrlw $8, %xmm0
+; SSE4-NEXT:    pminub %xmm1, %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
@@ -2147,13 +2147,13 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpmaxub %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpmaxub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpmaxub %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpmaxub %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaxub %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxub %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpmaxub %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpmaxub %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
@@ -2168,7 +2168,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmaxub %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpmaxub %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpmaxub %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index e00380f7108ed..a011755771a16 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -1069,13 +1069,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE4-LABEL: test_v16i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminud %xmm3, %xmm1
-; SSE4-NEXT:    pminud %xmm2, %xmm1
-; SSE4-NEXT:    pminud %xmm0, %xmm1
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pminud %xmm2, %xmm0
 ; SSE4-NEXT:    pminud %xmm1, %xmm0
-; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE4-NEXT:    pminud %xmm0, %xmm1
-; SSE4-NEXT:    movd %xmm1, %eax
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE4-NEXT:    pminud %xmm1, %xmm0
+; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -1083,8 +1083,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminud %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1206,14 +1206,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE4-LABEL: test_v32i32:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminud %xmm6, %xmm2
-; SSE4-NEXT:    pminud %xmm7, %xmm3
-; SSE4-NEXT:    pminud %xmm5, %xmm3
-; SSE4-NEXT:    pminud %xmm1, %xmm3
-; SSE4-NEXT:    pminud %xmm4, %xmm2
-; SSE4-NEXT:    pminud %xmm3, %xmm2
-; SSE4-NEXT:    pminud %xmm0, %xmm2
-; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE4-NEXT:    pminud %xmm4, %xmm0
 ; SSE4-NEXT:    pminud %xmm2, %xmm0
+; SSE4-NEXT:    pminud %xmm7, %xmm3
+; SSE4-NEXT:    pminud %xmm5, %xmm1
+; SSE4-NEXT:    pminud %xmm3, %xmm1
+; SSE4-NEXT:    pminud %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE4-NEXT:    pminud %xmm1, %xmm0
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE4-NEXT:    pminud %xmm0, %xmm1
 ; SSE4-NEXT:    movd %xmm1, %eax
@@ -1222,16 +1222,16 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpminud %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpminud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpminud %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpminud %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminud %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
@@ -1243,7 +1243,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpminud %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpminud %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpminud %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpminud %xmm1, %xmm0, %xmm0
@@ -1507,9 +1507,9 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE4-LABEL: test_v32i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminuw %xmm3, %xmm1
-; SSE4-NEXT:    pminuw %xmm2, %xmm1
-; SSE4-NEXT:    pminuw %xmm0, %xmm1
-; SSE4-NEXT:    phminposuw %xmm1, %xmm0
+; SSE4-NEXT:    pminuw %xmm2, %xmm0
+; SSE4-NEXT:    pminuw %xmm1, %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE4-NEXT:    retq
@@ -1519,8 +1519,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminuw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1597,13 +1597,13 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE4-LABEL: test_v64i16:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminuw %xmm7, %xmm3
-; SSE4-NEXT:    pminuw %xmm5, %xmm3
-; SSE4-NEXT:    pminuw %xmm1, %xmm3
+; SSE4-NEXT:    pminuw %xmm5, %xmm1
+; SSE4-NEXT:    pminuw %xmm3, %xmm1
 ; SSE4-NEXT:    pminuw %xmm6, %xmm2
-; SSE4-NEXT:    pminuw %xmm4, %xmm2
-; SSE4-NEXT:    pminuw %xmm3, %xmm2
-; SSE4-NEXT:    pminuw %xmm0, %xmm2
-; SSE4-NEXT:    phminposuw %xmm2, %xmm0
+; SSE4-NEXT:    pminuw %xmm4, %xmm0
+; SSE4-NEXT:    pminuw %xmm2, %xmm0
+; SSE4-NEXT:    pminuw %xmm1, %xmm0
+; SSE4-NEXT:    phminposuw %xmm0, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE4-NEXT:    retq
@@ -1614,13 +1614,13 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpminuw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpminuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminuw %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpminuw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpminuw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpminuw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpminuw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminuw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1630,7 +1630,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpminuw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpminuw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpminuw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
@@ -1887,31 +1887,31 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE2-LABEL: test_v64i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pminub %xmm3, %xmm1
-; SSE2-NEXT:    pminub %xmm2, %xmm1
-; SSE2-NEXT:    pminub %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pminub %xmm2, %xmm0
 ; SSE2-NEXT:    pminub %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE2-NEXT:    pminub %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE2-NEXT:    pminub %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    pminub %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $8, %xmm0
+; SSE2-NEXT:    pminub %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v64i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminub %xmm3, %xmm1
-; SSE4-NEXT:    pminub %xmm2, %xmm1
-; SSE4-NEXT:    pminub %xmm0, %xmm1
-; SSE4-NEXT:    movdqa %xmm1, %xmm0
-; SSE4-NEXT:    psrlw $8, %xmm0
+; SSE4-NEXT:    pminub %xmm2, %xmm0
 ; SSE4-NEXT:    pminub %xmm1, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrlw $8, %xmm1
+; SSE4-NEXT:    pminub %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE4-NEXT:    retq
@@ -1921,8 +1921,8 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpminub %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
@@ -1965,14 +1965,14 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE2-LABEL: test_v128i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pminub %xmm6, %xmm2
-; SSE2-NEXT:    pminub %xmm7, %xmm3
-; SSE2-NEXT:    pminub %xmm5, %xmm3
-; SSE2-NEXT:    pminub %xmm1, %xmm3
-; SSE2-NEXT:    pminub %xmm4, %xmm2
-; SSE2-NEXT:    pminub %xmm3, %xmm2
-; SSE2-NEXT:    pminub %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    pminub %xmm4, %xmm0
 ; SSE2-NEXT:    pminub %xmm2, %xmm0
+; SSE2-NEXT:    pminub %xmm7, %xmm3
+; SSE2-NEXT:    pminub %xmm5, %xmm1
+; SSE2-NEXT:    pminub %xmm3, %xmm1
+; SSE2-NEXT:    pminub %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pminub %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    pminub %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
@@ -1988,16 +1988,16 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE4-LABEL: test_v128i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pminub %xmm7, %xmm3
-; SSE4-NEXT:    pminub %xmm5, %xmm3
-; SSE4-NEXT:    pminub %xmm1, %xmm3
+; SSE4-NEXT:    pminub %xmm5, %xmm1
+; SSE4-NEXT:    pminub %xmm3, %xmm1
 ; SSE4-NEXT:    pminub %xmm6, %xmm2
-; SSE4-NEXT:    pminub %xmm4, %xmm2
-; SSE4-NEXT:    pminub %xmm3, %xmm2
-; SSE4-NEXT:    pminub %xmm0, %xmm2
-; SSE4-NEXT:    movdqa %xmm2, %xmm0
-; SSE4-NEXT:    psrlw $8, %xmm0
+; SSE4-NEXT:    pminub %xmm4, %xmm0
 ; SSE4-NEXT:    pminub %xmm2, %xmm0
-; SSE4-NEXT:    phminposuw %xmm0, %xmm0
+; SSE4-NEXT:    pminub %xmm1, %xmm0
+; SSE4-NEXT:    movdqa %xmm0, %xmm1
+; SSE4-NEXT:    psrlw $8, %xmm1
+; SSE4-NEXT:    pminub %xmm0, %xmm1
+; SSE4-NEXT:    phminposuw %xmm1, %xmm0
 ; SSE4-NEXT:    movd %xmm0, %eax
 ; SSE4-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE4-NEXT:    retq
@@ -2008,13 +2008,13 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpminub %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpminub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT:    vpminub %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpminub %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpminub %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpminub %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpminub %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpminub %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vphminposuw %xmm0, %xmm0
@@ -2026,7 +2026,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpminub %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpminub %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpminub %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 102409209fed0..580fdcddbc3c7 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -805,10 +805,10 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; SSE-LABEL: trunc_v64i8_v64i1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
-; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    psllw $7, %xmm1
-; SSE-NEXT:    pmovmskb %xmm1, %eax
+; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    psllw $7, %xmm0
+; SSE-NEXT:    pmovmskb %xmm0, %eax
 ; SSE-NEXT:    xorb %ah, %al
 ; SSE-NEXT:    setnp %al
 ; SSE-NEXT:    retq
@@ -844,8 +844,8 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1737,10 +1737,10 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; SSE-NEXT:    pxor %xmm4, %xmm4
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm3
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm1
 ; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
 ; SSE-NEXT:    pxor %xmm0, %xmm1
 ; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    xorb %ah, %al
@@ -1752,13 +1752,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    xorb %ah, %al
 ; AVX1-NEXT:    setnp %al
@@ -1789,8 +1789,8 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2664,10 +2664,10 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pcmpeqb %xmm6, %xmm2
 ; SSE-NEXT:    pcmpeqb %xmm4, %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pcmpeqb %xmm7, %xmm3
 ; SSE-NEXT:    pcmpeqb %xmm5, %xmm1
 ; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
 ; SSE-NEXT:    pxor %xmm0, %xmm1
 ; SSE-NEXT:    pmovmskb %xmm1, %eax
 ; SSE-NEXT:    xorb %ah, %al
@@ -2678,6 +2678,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm4
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm1, %xmm1
@@ -2686,7 +2687,6 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX1-NEXT:    xorb %ah, %al
 ; AVX1-NEXT:    setnp %al
@@ -2716,7 +2716,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm1
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm3, %xmm1
-; AVX512F-NEXT:    vpxor %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0

diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
index 51fa98be790cf..1e67ba8b6b6a0 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll
@@ -74,11 +74,11 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; SSE-LABEL: test_v8i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
-; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movq %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v8i64:
@@ -122,21 +122,21 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; SSE-LABEL: test_v16i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm7, %xmm3
-; SSE-NEXT:    pxor %xmm5, %xmm3
-; SSE-NEXT:    pxor %xmm1, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pxor %xmm3, %xmm2
-; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pxor %xmm4, %xmm0
 ; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    movq %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
@@ -149,7 +149,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-LABEL: test_v16i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -273,13 +273,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; SSE-LABEL: test_v16i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
-; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
@@ -329,14 +329,14 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; SSE-LABEL: test_v32i32:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm7, %xmm3
-; SSE-NEXT:    pxor %xmm5, %xmm3
-; SSE-NEXT:    pxor %xmm1, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pxor %xmm3, %xmm2
-; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pxor %xmm4, %xmm0
 ; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pxor %xmm0, %xmm1
 ; SSE-NEXT:    movd %xmm1, %eax
@@ -345,7 +345,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
@@ -360,7 +360,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-LABEL: test_v32i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -536,16 +536,16 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE-LABEL: test_v32i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
-; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -605,14 +605,14 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE-LABEL: test_v64i16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm7, %xmm3
-; SSE-NEXT:    pxor %xmm5, %xmm3
-; SSE-NEXT:    pxor %xmm1, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pxor %xmm3, %xmm2
-; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pxor %xmm4, %xmm0
 ; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pxor %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -625,7 +625,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
@@ -643,7 +643,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-LABEL: test_v64i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -870,19 +870,19 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; SSE-LABEL: test_v64i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm3, %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm1
-; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pxor %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
 ; SSE-NEXT:    pxor %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrlw $8, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -948,14 +948,14 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; SSE-LABEL: test_v128i8:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pxor %xmm6, %xmm2
-; SSE-NEXT:    pxor %xmm7, %xmm3
-; SSE-NEXT:    pxor %xmm5, %xmm3
-; SSE-NEXT:    pxor %xmm1, %xmm3
-; SSE-NEXT:    pxor %xmm4, %xmm2
-; SSE-NEXT:    pxor %xmm3, %xmm2
-; SSE-NEXT:    pxor %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE-NEXT:    pxor %xmm4, %xmm0
 ; SSE-NEXT:    pxor %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm7, %xmm3
+; SSE-NEXT:    pxor %xmm5, %xmm1
+; SSE-NEXT:    pxor %xmm3, %xmm1
+; SSE-NEXT:    pxor %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
 ; SSE-NEXT:    pxor %xmm0, %xmm1
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
@@ -971,7 +971,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-LABEL: test_v128i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
@@ -991,7 +991,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-LABEL: test_v128i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0

diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index 1b772f97469f6..c5fcacc5f2a42 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -2615,12 +2615,12 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ;
 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
@@ -2726,22 +2726,22 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm8 = [255,255,255,255]
-; AVX1-NEXT:    vandps %ymm7, %ymm8, %ymm7
-; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %ymm6, %ymm8, %ymm6
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %ymm5, %ymm1, %ymm1
 ; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vpackusdw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm3
-; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %ymm4, %ymm8, %ymm3
-; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
@@ -2751,17 +2751,17 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX2-LABEL: trunc_and_v16i64_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255]
-; AVX2-NEXT:    vpand %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vpand %ymm7, %ymm3, %ymm3
-; AVX2-NEXT:    vpand %ymm6, %ymm8, %ymm6
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm5, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVX2-NEXT:    vpand %ymm7, %ymm3, %ymm3
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
 ; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT:    vpand %ymm5, %ymm8, %ymm3
-; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm4, %ymm8, %ymm3
-; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
@@ -2788,28 +2788,28 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; SSE-LABEL: trunc_and_v16i32_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT:    pand %xmm8, %xmm7
-; SSE-NEXT:    pand %xmm3, %xmm7
-; SSE-NEXT:    pand %xmm8, %xmm6
-; SSE-NEXT:    pand %xmm2, %xmm6
-; SSE-NEXT:    packuswb %xmm7, %xmm6
-; SSE-NEXT:    pand %xmm8, %xmm5
-; SSE-NEXT:    pand %xmm1, %xmm5
-; SSE-NEXT:    pand %xmm4, %xmm8
-; SSE-NEXT:    pand %xmm8, %xmm0
-; SSE-NEXT:    packuswb %xmm5, %xmm0
-; SSE-NEXT:    packuswb %xmm6, %xmm0
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    pand %xmm5, %xmm1
+; SSE-NEXT:    pand %xmm6, %xmm2
+; SSE-NEXT:    pand %xmm7, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT:    pand %xmm4, %xmm3
+; SSE-NEXT:    pand %xmm4, %xmm2
+; SSE-NEXT:    packuswb %xmm3, %xmm2
+; SSE-NEXT:    pand %xmm4, %xmm1
+; SSE-NEXT:    pand %xmm4, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
+; SSE-NEXT:    packuswb %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_and_v16i32_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
@@ -2819,10 +2819,10 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ;
 ; AVX2-LABEL: trunc_and_v16i32_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -2845,12 +2845,12 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; SSE-LABEL: trunc_and_v16i16_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT:    pand %xmm4, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm2, %xmm4
-; SSE-NEXT:    pand %xmm4, %xmm0
-; SSE-NEXT:    packuswb %xmm3, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_and_v16i16_v16i8:

diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index 7e916561826a7..4680e86cf73ad 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -3020,24 +3020,24 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE2-LABEL: trunc_packus_v4i64_v4i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
 ; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
 ; SSE2-NEXT:    pand %xmm5, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm9, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT:    por %xmm9, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm7
@@ -3049,26 +3049,26 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
 ; SSE2-NEXT:    pandn %xmm2, %xmm5
 ; SSE2-NEXT:    por %xmm5, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm6, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm4
 ; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
@@ -3276,24 +3276,24 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE2-LABEL: trunc_packus_v4i64_v4i8_store:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
 ; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483903,2147483903]
 ; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm8
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2]
 ; SSE2-NEXT:    pand %xmm5, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3]
-; SSE2-NEXT:    por %xmm9, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm4
-; SSE2-NEXT:    por %xmm1, %xmm4
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3]
+; SSE2-NEXT:    por %xmm9, %xmm3
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pandn %xmm2, %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm3, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pcmpeqd %xmm6, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm7
@@ -3305,31 +3305,31 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
 ; SSE2-NEXT:    pandn %xmm2, %xmm1
 ; SSE2-NEXT:    por %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm5, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm0, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm4
 ; SSE2-NEXT:    pand %xmm2, %xmm3
 ; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    packuswb %xmm3, %xmm5
-; SSE2-NEXT:    packuswb %xmm5, %xmm5
-; SSE2-NEXT:    packuswb %xmm5, %xmm5
-; SSE2-NEXT:    movd %xmm5, (%rdi)
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm1
+; SSE2-NEXT:    packuswb %xmm1, %xmm1
+; SSE2-NEXT:    movd %xmm1, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_packus_v4i64_v4i8_store:

diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
index 943424171d662..c785db8879d49 100644
--- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
+++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
@@ -46,16 +46,16 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1
 ; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
 ; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
 ; X86-NEXT:    kmovw %k0, %edi
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    kmovw %k1, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
-; X86-NEXT:    kmovw %k0, %edx
-; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload
+; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload
+; X86-NEXT:    kmovw %k2, %edi
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movw %dx, (%esi)
+; X86-NEXT:    kmovw %k1, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movw %ax, (%esi)
 ; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -104,12 +104,12 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1
 ; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; X64-NEXT:    kmovw %k0, %edi
 ; X64-NEXT:    kmovw %k1, %r8d
-; X64-NEXT:    addl %ecx, %edx
-; X64-NEXT:    addl %r8d, %eax
-; X64-NEXT:    addl %esi, %eax
-; X64-NEXT:    addl %edx, %eax
 ; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    movw %ax, (%rbx)
+; X64-NEXT:    addl %ecx, %edx
+; X64-NEXT:    addl %eax, %edx
+; X64-NEXT:    addl %r8d, %edx
+; X64-NEXT:    addl %esi, %edx
+; X64-NEXT:    movw %dx, (%rbx)
 ; X64-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %rbp

diff --git a/llvm/test/CodeGen/X86/win-smallparams.ll b/llvm/test/CodeGen/X86/win-smallparams.ll
index 0993e20c91d13..5ca8f6705479f 100644
--- a/llvm/test/CodeGen/X86/win-smallparams.ll
+++ b/llvm/test/CodeGen/X86/win-smallparams.ll
@@ -65,57 +65,51 @@ define i32 @manyargs(i8 %a, i16 %b, i8 %c, i16 %d, i8 %e, i16 %f) {
 ; WIN64:       # %bb.0: # %entry
 ; WIN64-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
 ; WIN64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; WIN64-NEXT:    movsbl %cl, %ecx
-; WIN64-NEXT:    movswl %dx, %eax
+; WIN64-NEXT:    movsbl %cl, %eax
+; WIN64-NEXT:    movswl %dx, %ecx
+; WIN64-NEXT:    addl %eax, %ecx
 ; WIN64-NEXT:    movzbl %r8b, %edx
-; WIN64-NEXT:    addl %eax, %edx
 ; WIN64-NEXT:    movzwl %r9w, %eax
 ; WIN64-NEXT:    addl %edx, %eax
+; WIN64-NEXT:    addl %ecx, %eax
 ; WIN64-NEXT:    addl %r11d, %eax
 ; WIN64-NEXT:    addl %r10d, %eax
-; WIN64-NEXT:    addl %ecx, %eax
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-MSVC-LABEL: manyargs:
 ; WIN32-MSVC:       # %bb.0: # %entry
-; WIN32-MSVC-NEXT:    pushl %edi
 ; WIN32-MSVC-NEXT:    pushl %esi
 ; WIN32-MSVC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-MSVC-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; WIN32-MSVC-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-MSVC-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; WIN32-MSVC-NEXT:    movswl {{[0-9]+}}(%esp), %edi
-; WIN32-MSVC-NEXT:    addl %esi, %edi
-; WIN32-MSVC-NEXT:    addl %edx, %edi
-; WIN32-MSVC-NEXT:    addl %ecx, %edi
-; WIN32-MSVC-NEXT:    addl %eax, %edi
+; WIN32-MSVC-NEXT:    addl %eax, %ecx
+; WIN32-MSVC-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-MSVC-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; WIN32-MSVC-NEXT:    addl %eax, %edx
+; WIN32-MSVC-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; WIN32-MSVC-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; WIN32-MSVC-NEXT:    addl %edi, %eax
+; WIN32-MSVC-NEXT:    addl %esi, %eax
+; WIN32-MSVC-NEXT:    addl %edx, %eax
+; WIN32-MSVC-NEXT:    addl %ecx, %eax
 ; WIN32-MSVC-NEXT:    popl %esi
-; WIN32-MSVC-NEXT:    popl %edi
 ; WIN32-MSVC-NEXT:    retl
 ;
 ; WIN32-GNU-LABEL: manyargs:
 ; WIN32-GNU:       # %bb.0: # %entry
-; WIN32-GNU-NEXT:    pushl %edi
-; WIN32-GNU-NEXT:    .cfi_def_cfa_offset 8
 ; WIN32-GNU-NEXT:    pushl %esi
-; WIN32-GNU-NEXT:    .cfi_def_cfa_offset 12
-; WIN32-GNU-NEXT:    .cfi_offset %esi, -12
-; WIN32-GNU-NEXT:    .cfi_offset %edi, -8
+; WIN32-GNU-NEXT:    .cfi_def_cfa_offset 8
+; WIN32-GNU-NEXT:    .cfi_offset %esi, -8
 ; WIN32-GNU-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; WIN32-GNU-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; WIN32-GNU-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-GNU-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; WIN32-GNU-NEXT:    movswl {{[0-9]+}}(%esp), %edi
-; WIN32-GNU-NEXT:    addl %esi, %edi
-; WIN32-GNU-NEXT:    addl %edx, %edi
-; WIN32-GNU-NEXT:    addl %ecx, %edi
-; WIN32-GNU-NEXT:    addl %eax, %edi
+; WIN32-GNU-NEXT:    addl %eax, %ecx
+; WIN32-GNU-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; WIN32-GNU-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; WIN32-GNU-NEXT:    addl %eax, %edx
+; WIN32-GNU-NEXT:    movswl {{[0-9]+}}(%esp), %esi
 ; WIN32-GNU-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; WIN32-GNU-NEXT:    addl %edi, %eax
+; WIN32-GNU-NEXT:    addl %esi, %eax
+; WIN32-GNU-NEXT:    addl %edx, %eax
+; WIN32-GNU-NEXT:    addl %ecx, %eax
 ; WIN32-GNU-NEXT:    popl %esi
-; WIN32-GNU-NEXT:    popl %edi
 ; WIN32-GNU-NEXT:    retl
 entry:
   %aa = sext i8 %a to i32

diff --git a/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll b/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll
index 4f930adae3d18..e5732d76ee8b3 100644
--- a/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll
@@ -5,16 +5,16 @@
 define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
 ; DARWIN-LABEL: test_sse:
 ; DARWIN:       ## %bb.0:
-; DARWIN-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; DARWIN-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; DARWIN-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; DARWIN-NEXT:    vpaddd %xmm3, %xmm2, %xmm1
 ; DARWIN-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; DARWIN-NEXT:    retl
 ;
 ; LINUX-LABEL: test_sse:
 ; LINUX:       # %bb.0:
 ; LINUX-NEXT:    subl $12, %esp
-; LINUX-NEXT:    vpaddd {{[0-9]+}}(%esp), %xmm2, %xmm2
-; LINUX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; LINUX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; LINUX-NEXT:    vpaddd {{[0-9]+}}(%esp), %xmm2, %xmm1
 ; LINUX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; LINUX-NEXT:    addl $12, %esp
 ; LINUX-NEXT:    retl
@@ -27,8 +27,8 @@ define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %
 define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind {
 ; DARWIN-LABEL: test_avx:
 ; DARWIN:       ## %bb.0:
-; DARWIN-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
-; DARWIN-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; DARWIN-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; DARWIN-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
 ; DARWIN-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; DARWIN-NEXT:    retl
 ;
@@ -38,8 +38,8 @@ define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %
 ; LINUX-NEXT:    movl %esp, %ebp
 ; LINUX-NEXT:    andl $-32, %esp
 ; LINUX-NEXT:    subl $32, %esp
-; LINUX-NEXT:    vpaddd 8(%ebp), %ymm2, %ymm2
-; LINUX-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; LINUX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; LINUX-NEXT:    vpaddd 8(%ebp), %ymm2, %ymm1
 ; LINUX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; LINUX-NEXT:    movl %ebp, %esp
 ; LINUX-NEXT:    popl %ebp
@@ -53,8 +53,8 @@ define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %
 define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind {
 ; DARWIN-LABEL: test_avx512:
 ; DARWIN:       ## %bb.0:
-; DARWIN-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
-; DARWIN-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; DARWIN-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; DARWIN-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
 ; DARWIN-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; DARWIN-NEXT:    retl
 ;
@@ -64,8 +64,8 @@ define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16
 ; LINUX-NEXT:    movl %esp, %ebp
 ; LINUX-NEXT:    andl $-64, %esp
 ; LINUX-NEXT:    subl $64, %esp
-; LINUX-NEXT:    vpaddd 8(%ebp), %zmm2, %zmm2
-; LINUX-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; LINUX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; LINUX-NEXT:    vpaddd 8(%ebp), %zmm2, %zmm1
 ; LINUX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; LINUX-NEXT:    movl %ebp, %esp
 ; LINUX-NEXT:    popl %ebp

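For context on what the test_sse/test_avx/test_avx512 hunks above are checking: the old DARWIN and LINUX sequences computed the sum as a serial chain ((a + b) + c) + d, each vpaddd waiting on the previous one, while the new sequences compute (a + b) + (c + d), so the first two adds can issue in parallel. A minimal LLVM IR sketch of the same dataflow shape (function and value names are hypothetical, for illustration only; the pass itself rewrites MachineInstrs, not IR):

  define <4 x i32> @reassoc_sketch(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
    ; serial form ((%a + %b) + %c) + %d has a critical path of 3 adds;
    ; the reassociated form below shortens it to 2 adds
    %ab = add <4 x i32> %a, %b
    %cd = add <4 x i32> %c, %d        ; independent of %ab, can issue in parallel
    %sum = add <4 x i32> %ab, %cd
    ret <4 x i32> %sum
  }
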
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index ef9fe2c4daf49..0ef20f21ac8c0 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -123,14 +123,14 @@ define <4 x i64> @load_factori64_4(ptr %ptr) nounwind {
 ; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
-; AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -147,7 +147,7 @@ define <4 x i64> @load_factori64_4(ptr %ptr) nounwind {
 ; AVX2OR512-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX2OR512-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2OR512-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX2OR512-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
+; AVX2OR512-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
 ; AVX2OR512-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; AVX2OR512-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
 ; AVX2OR512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
@@ -812,14 +812,14 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
 ; AVX1-NEXT:    vorps %ymm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpaddb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
@@ -895,10 +895,10 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(ptr %ptr){
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 	%wide.vec = load <24 x i8>, ptr %ptr
@@ -1270,27 +1270,27 @@ ret void
 define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm8
+; AVX1-NEXT:    vmovdqu (%rdi), %xmm9
 ; AVX1-NEXT:    vmovups 16(%rdi), %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm10
 ; AVX1-NEXT:    vmovdqu 64(%rdi), %xmm3
 ; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm4
-; AVX1-NEXT:    vmovdqu 96(%rdi), %xmm5
+; AVX1-NEXT:    vmovdqu 96(%rdi), %xmm6
 ; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm2
 ; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm12
 ; AVX1-NEXT:    vmovdqu 160(%rdi), %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
-; AVX1-NEXT:    vpshufb %xmm11, %xmm5, %xmm6
+; AVX1-NEXT:    vpshufb %xmm11, %xmm6, %xmm5
 ; AVX1-NEXT:    vpshufb %xmm11, %xmm12, %xmm7
-; AVX1-NEXT:    vpshufb %xmm11, %xmm8, %xmm9
+; AVX1-NEXT:    vpshufb %xmm11, %xmm9, %xmm8
 ; AVX1-NEXT:    vpshufb %xmm11, %xmm10, %xmm11
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm13, %xmm5, %xmm5
+; AVX1-NEXT:    vpshufb %xmm13, %xmm6, %xmm6
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm2, %xmm15
 ; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpor %xmm5, %xmm15, %xmm0
+; AVX1-NEXT:    vpor %xmm6, %xmm15, %xmm0
 ; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpshufb %xmm13, %xmm12, %xmm12
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm1, %xmm15
@@ -1298,21 +1298,21 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpor %xmm12, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufb %xmm13, %xmm8, %xmm8
+; AVX1-NEXT:    vpshufb %xmm13, %xmm9, %xmm9
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm1, %xmm15
-; AVX1-NEXT:    vpor %xmm8, %xmm15, %xmm5
-; AVX1-NEXT:    vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpshufb %xmm13, %xmm10, %xmm8
+; AVX1-NEXT:    vpor %xmm9, %xmm15, %xmm6
+; AVX1-NEXT:    vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vpshufb %xmm13, %xmm10, %xmm9
 ; AVX1-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpshufb %xmm14, %xmm3, %xmm10
-; AVX1-NEXT:    vpor %xmm8, %xmm10, %xmm10
-; AVX1-NEXT:    vpshufb %xmm13, %xmm3, %xmm8
-; AVX1-NEXT:    vpshufb %xmm14, %xmm4, %xmm5
-; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm8
+; AVX1-NEXT:    vpor %xmm9, %xmm10, %xmm10
+; AVX1-NEXT:    vpshufb %xmm13, %xmm3, %xmm9
+; AVX1-NEXT:    vpshufb %xmm14, %xmm4, %xmm6
+; AVX1-NEXT:    vpor %xmm6, %xmm9, %xmm6
+; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm9
 ; AVX1-NEXT:    vpshufb %xmm13, %xmm1, %xmm3
-; AVX1-NEXT:    vpshufb %xmm14, %xmm8, %xmm12
+; AVX1-NEXT:    vpshufb %xmm14, %xmm9, %xmm12
 ; AVX1-NEXT:    vpor %xmm3, %xmm12, %xmm3
 ; AVX1-NEXT:    vmovdqu 176(%rdi), %xmm12
 ; AVX1-NEXT:    vpshufb %xmm13, %xmm0, %xmm1
@@ -1324,59 +1324,59 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
 ; AVX1-NEXT:    vpor %xmm13, %xmm14, %xmm14
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm13
-; AVX1-NEXT:    vpor %xmm6, %xmm13, %xmm13
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpor %xmm5, %xmm13, %xmm13
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm14
 ; AVX1-NEXT:    vpor %xmm7, %xmm14, %xmm14
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vpshufb %xmm0, %xmm8, %xmm7
-; AVX1-NEXT:    vpor %xmm7, %xmm9, %xmm7
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm9[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm9, %xmm7
+; AVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm8[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vpor %xmm0, %xmm11, %xmm0
-; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT:    vpshufb %xmm9, %xmm10, %xmm10
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm11[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
 ; AVX1-NEXT:    vpshufb %xmm11, %xmm4, %xmm4
 ; AVX1-NEXT:    vpor %xmm4, %xmm10, %xmm4
-; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm9, %xmm3, %xmm5
-; AVX1-NEXT:    vpshufb %xmm11, %xmm8, %xmm8
-; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm8, %xmm3, %xmm10
+; AVX1-NEXT:    vpshufb %xmm11, %xmm9, %xmm9
+; AVX1-NEXT:    vpor %xmm9, %xmm10, %xmm9
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm9, %xmm3, %xmm5
-; AVX1-NEXT:    vpshufb %xmm11, %xmm12, %xmm8
-; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm8, %xmm3, %xmm10
+; AVX1-NEXT:    vpshufb %xmm11, %xmm12, %xmm12
+; AVX1-NEXT:    vpor %xmm12, %xmm10, %xmm10
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm9, %xmm3, %xmm5
-; AVX1-NEXT:    vpshufb %xmm11, %xmm15, %xmm8
-; AVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-NEXT:    vpaddb %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT:    vpshufb %xmm11, %xmm15, %xmm11
+; AVX1-NEXT:    vpor %xmm11, %xmm8, %xmm8
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT:    vpshufb %xmm11, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb %xmm12, %xmm3, %xmm3
 ; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpshufb %xmm6, %xmm7, %xmm3
+; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpshufb %xmm11, %xmm7, %xmm3
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
 ; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm9, %xmm3
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm6, %xmm14, %xmm3
+; AVX1-NEXT:    vpshufb %xmm11, %xmm14, %xmm3
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
 ; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm10, %xmm3
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm6, %xmm13, %xmm3
+; AVX1-NEXT:    vpshufb %xmm11, %xmm13, %xmm3
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vpshufb %xmm8, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm12, %xmm4, %xmm4
 ; AVX1-NEXT:    vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddb %xmm3, %xmm8, %xmm3
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1

diff --git a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 263ff329163c5..77e41cbd8ce18 100644
--- a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -38,10 +38,10 @@ define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
 ; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    movl %edi, %edx
 ; CHECK-NEXT:    callq bar@PLT
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    addl %eax, %edx
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    cvtsi2ss %eax, %xmm0
+; CHECK-NEXT:    cvtsi2ss %edx, %xmm0
 ; CHECK-NEXT:    addss %xmm1, %xmm0
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8

diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index cbbe089c80192..2517530ca28ec 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -212,66 +212,68 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $12, %esp
+; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    imull %ecx, %edi
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %eax, %edi
-; WIN32-NEXT:    addl %edx, %edi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    imull %ebp, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    addl %eax, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    addl %esi, %eax
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %ecx
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    addl %eax, %ebp
+; WIN32-NEXT:    addl %esi, %ebp
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    addl %eax, %esi
+; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    adcl %ebp, %esi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %ebx, %esi
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %ebp, %ecx
 ; WIN32-NEXT:    adcl $0, %edi
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl %esi, %ebx
-; WIN32-NEXT:    adcl %edi, %ebp
-; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    adcl %edi, %ebx
+; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebp, %eax
-; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %ebx, 4(%eax)
+; WIN32-NEXT:    movl %ebp, 4(%eax)
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $12, %esp
+; WIN32-NEXT:    addl $8, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -469,30 +471,28 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %bl
-; WIN32-NEXT:    andb %dl, %bl
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    andb %dl, %cl
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    seto %bl
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %ecx, %edx
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bh, %ch
+; WIN32-NEXT:    orb %bl, %ch
+; WIN32-NEXT:    orb %cl, %ch
 ; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %edx, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    orb %ch, %cl
-; WIN32-NEXT:    orb %bl, %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %edx, 4(%esi)
@@ -571,64 +571,66 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %edx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    sarl $31, %edx
-; WIN32-NEXT:    movl %esi, %ecx
-; WIN32-NEXT:    imull %edx, %ecx
-; WIN32-NEXT:    mull %edx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %eax, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    sarl $31, %eax
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    imull %ecx, %edi
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    movl %eax, %ecx
 ; WIN32-NEXT:    addl %eax, %esi
-; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    addl %edi, %ebx
+; WIN32-NEXT:    addl %eax, %ebx
+; WIN32-NEXT:    addl %ecx, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %esi
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    adcl %esi, %ebx
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %edx, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %ebp, %ecx
-; WIN32-NEXT:    adcl $0, %ebx
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    addl %esi, %edi
+; WIN32-NEXT:    adcl $0, %ecx
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    adcl %ebx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    adcl %ecx, %ebp
 ; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    addl %ebp, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    xorl %ebp, %edx
-; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    adcl %ebx, %edx
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    xorl %esi, %edx
+; WIN32-NEXT:    xorl %eax, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    orl %edx, %ebp
+; WIN32-NEXT:    orl %edx, %esi
 ; WIN32-NEXT:    jne LBB12_2
 ; WIN32-NEXT:  # %bb.1:
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:  LBB12_2:
-; WIN32-NEXT:    movl %ebx, %edx
+; WIN32-NEXT:    movl %edi, %edx
 ; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
@@ -723,13 +725,13 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    seto %bh
 ; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT:    orb %bl, %bh
 ; WIN32-NEXT:    addl %eax, %edi
 ; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %edi, %edx
 ; WIN32-NEXT:    setb %al
 ; WIN32-NEXT:    orb %bh, %al
-; WIN32-NEXT:    orb %bl, %al
 ; WIN32-NEXT:    testb %al, %al
 ; WIN32-NEXT:    jne LBB14_2
 ; WIN32-NEXT:  # %bb.1:
@@ -989,63 +991,65 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $8, %esp
+; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %eax, %esi
-; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    imull %edi, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    addl %eax, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    addl %ebp, %eax
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %edi, %ebp
-; WIN32-NEXT:    adcl $0, %esi
-; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    addl %eax, %ebx
+; WIN32-NEXT:    addl %esi, %ebx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    imull %ebp, %edi
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    addl %eax, %esi
+; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    adcl %ebx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ebp, %edi
-; WIN32-NEXT:    adcl %esi, %ebx
-; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    adcl $0, %ebp
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    addl %ecx, %ebx
+; WIN32-NEXT:    adcl %ebp, %edi
+; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebx, %eax
-; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    xorl %edi, %edx
-; WIN32-NEXT:    xorl %eax, %edi
-; WIN32-NEXT:    orl %edx, %edi
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    sarl $31, %ebx
+; WIN32-NEXT:    xorl %ebx, %edx
+; WIN32-NEXT:    xorl %eax, %ebx
+; WIN32-NEXT:    orl %edx, %ebx
 ; WIN32-NEXT:    jne LBB18_1
 ; WIN32-NEXT:  # %bb.3: # %continue
 ; WIN32-NEXT:    movb $1, %al
 ; WIN32-NEXT:  LBB18_2: # %overflow
-; WIN32-NEXT:    addl $8, %esp
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1313,30 +1317,28 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %bl
-; WIN32-NEXT:    andb %dl, %bl
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    andb %dl, %cl
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    seto %bl
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %ecx, %edx
-; WIN32-NEXT:    seto %cl
-; WIN32-NEXT:    orb %bh, %cl
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %edx, %eax
 ; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    seto %ch
+; WIN32-NEXT:    orb %bl, %ch
+; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    leal (%edi,%eax), %esi
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %cl, %al
-; WIN32-NEXT:    orb %bl, %al
+; WIN32-NEXT:    orb %ch, %al
 ; WIN32-NEXT:    subb $1, %al
 ; WIN32-NEXT:    je LBB22_1
 ; WIN32-NEXT:  # %bb.3: # %continue
@@ -1694,68 +1696,72 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $20, %esp
+; WIN32-NEXT:    subl $16, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %edx
-; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl 4(%eax), %ebx
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl 4(%eax), %esi
+; WIN32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    sarl $31, %eax
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    imull %esi, %ecx
 ; WIN32-NEXT:    mull %edx
 ; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %eax, %esi
-; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    imull %edi, %ecx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    addl %ecx, %ebx
+; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    addl %eax, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    addl %eax, %edi
+; WIN32-NEXT:    addl %esi, %edi
+; WIN32-NEXT:    addl %ebp, %ebx
 ; WIN32-NEXT:    addl %eax, %ebp
 ; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT:    adcl %ebx, %edi
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; WIN32-NEXT:    adcl $0, %ebp
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %ebp, %ecx
+; WIN32-NEXT:    adcl $0, %ebx
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl %edi, %ebx
-; WIN32-NEXT:    adcl %ebp, %esi
-; WIN32-NEXT:    setb (%esp) # 1-byte Folded Spill
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    adcl %ebx, %esi
+; WIN32-NEXT:    setb %cl
+; WIN32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    addl %esi, %eax
-; WIN32-NEXT:    movzbl (%esp), %esi # 1-byte Folded Reload
-; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %edi, %edx
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %ebx, 4(%eax)
+; WIN32-NEXT:    movl %ebp, 4(%eax)
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $20, %esp
+; WIN32-NEXT:    addl $16, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1802,59 +1808,61 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    subl $16, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ebx
-; WIN32-NEXT:    movl 4(%eax), %ebp
-; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl (%eax), %ebp
+; WIN32-NEXT:    movl 4(%eax), %ebx
+; WIN32-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ebp, %edi
-; WIN32-NEXT:    imull %ecx, %edi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %eax, %edi
-; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    sarl $31, %eax
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %eax, %edi
 ; WIN32-NEXT:    addl %eax, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    addl %esi, %ecx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    addl %ebx, %esi
+; WIN32-NEXT:    addl %eax, %esi
+; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %ecx
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    adcl %ecx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %ebp, %esi
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl $0, %edi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull (%esp) # 4-byte Folded Reload
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    adcl %edi, %ebx
+; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl %esi, %ebx
-; WIN32-NEXT:    adcl %edi, %ebp
-; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT:    addl %ebp, %eax
-; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    mull (%esp) # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %ebx, 4(%eax)
+; WIN32-NEXT:    movl %ebp, 4(%eax)
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
@@ -2213,36 +2221,34 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ecx
+; WIN32-NEXT:    movl (%eax), %esi
 ; WIN32-NEXT:    movl 4(%eax), %eax
-; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    testl %ebx, %ebx
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %bl
-; WIN32-NEXT:    andb %dl, %bl
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    andb %dl, %cl
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    seto %ch
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    orb %ch, %bl
+; WIN32-NEXT:    orb %cl, %bl
+; WIN32-NEXT:    leal (%edi,%eax), %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    seto %bh
-; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    addl %ecx, %edx
 ; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %bh, %cl
 ; WIN32-NEXT:    orb %bl, %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %edx, 4(%esi)
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -2293,36 +2299,33 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl (%edx), %ecx
+; WIN32-NEXT:    movl (%edx), %ebp
 ; WIN32-NEXT:    movl 4(%edx), %esi
 ; WIN32-NEXT:    testl %eax, %eax
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %bl
-; WIN32-NEXT:    andb %dl, %bl
-; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    andb %dl, %cl
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    seto %bl
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    seto %bh
-; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    seto %ch
+; WIN32-NEXT:    orb %bl, %ch
+; WIN32-NEXT:    orb %cl, %ch
 ; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %bh, %cl
-; WIN32-NEXT:    orb %bl, %cl
+; WIN32-NEXT:    orb %ch, %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %edx, 4(%esi)
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
