[llvm] d24c93c - [X86] Enable reassociation for ADD instructions

Guozhi Wei via llvm-commits llvm-commits@lists.llvm.org
Tue Oct 25 17:48:35 PDT 2022


Author: Guozhi Wei
Date: 2022-10-26T00:46:13Z
New Revision: d24c93cc4107dca68a2760199e970cb04cdeed90

URL: https://github.com/llvm/llvm-project/commit/d24c93cc4107dca68a2760199e970cb04cdeed90
DIFF: https://github.com/llvm/llvm-project/commit/d24c93cc4107dca68a2760199e970cb04cdeed90.diff

LOG: [X86] Enable reassociation for ADD instructions

ADD is an associative and commutative operation, so we can reassociate it to shorten dependency chains.
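
As an illustration (not from the commit itself), reassociation rebalances a
serial chain of adds so that independent additions can issue in parallel: the
machine combiner can now rewrite ((a + b) + c) + d as (a + b) + (c + d),
shortening the critical path. A minimal, hypothetical IR example:

    ; Three dependent adds form a serial chain of length 3.
    define i32 @chain(i32 %a, i32 %b, i32 %c, i32 %d) {
      %t0 = add i32 %a, %b   ; a + b
      %t1 = add i32 %t0, %c  ; (a + b) + c
      %t2 = add i32 %t1, %d  ; ((a + b) + c) + d
      ret i32 %t2
    }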

Differential Revision: https://reviews.llvm.org/D136396

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86InstrInfo.cpp
    llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
    llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
    llvm/test/CodeGen/X86/add-sub-bool.ll
    llvm/test/CodeGen/X86/alias-static-alloca.ll
    llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
    llvm/test/CodeGen/X86/avx512-intrinsics.ll
    llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
    llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
    llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
    llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
    llvm/test/CodeGen/X86/divide-by-constant.ll
    llvm/test/CodeGen/X86/divmod128.ll
    llvm/test/CodeGen/X86/fold-add.ll
    llvm/test/CodeGen/X86/fold-tied-op.ll
    llvm/test/CodeGen/X86/h-registers-1.ll
    llvm/test/CodeGen/X86/hipe-cc.ll
    llvm/test/CodeGen/X86/hipe-cc64.ll
    llvm/test/CodeGen/X86/imul.ll
    llvm/test/CodeGen/X86/lea-opt-cse4.ll
    llvm/test/CodeGen/X86/lea-opt2.ll
    llvm/test/CodeGen/X86/lrshrink.ll
    llvm/test/CodeGen/X86/midpoint-int.ll
    llvm/test/CodeGen/X86/misched-balance.ll
    llvm/test/CodeGen/X86/misched-matrix.ll
    llvm/test/CodeGen/X86/mul-constant-i16.ll
    llvm/test/CodeGen/X86/mul-constant-i32.ll
    llvm/test/CodeGen/X86/mul-constant-i64.ll
    llvm/test/CodeGen/X86/mul-i1024.ll
    llvm/test/CodeGen/X86/mul-i256.ll
    llvm/test/CodeGen/X86/mul-i512.ll
    llvm/test/CodeGen/X86/mul128.ll
    llvm/test/CodeGen/X86/mul64.ll
    llvm/test/CodeGen/X86/muloti.ll
    llvm/test/CodeGen/X86/popcnt.ll
    llvm/test/CodeGen/X86/pr34080-2.ll
    llvm/test/CodeGen/X86/pr36865.ll
    llvm/test/CodeGen/X86/reassociate-add.ll
    llvm/test/CodeGen/X86/smul-with-overflow.ll
    llvm/test/CodeGen/X86/smul_fix.ll
    llvm/test/CodeGen/X86/smul_fix_sat.ll
    llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/sse-regcall.ll
    llvm/test/CodeGen/X86/stack-clash-large.ll
    llvm/test/CodeGen/X86/statepoint-live-in.ll
    llvm/test/CodeGen/X86/statepoint-regs.ll
    llvm/test/CodeGen/X86/swift-return.ll
    llvm/test/CodeGen/X86/twoaddr-lea.ll
    llvm/test/CodeGen/X86/umul-with-overflow.ll
    llvm/test/CodeGen/X86/umul_fix.ll
    llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
    llvm/test/CodeGen/X86/win-smallparams.ll
    llvm/test/CodeGen/X86/x86-cmov-converter.ll
    llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b1edbf7d179db..e57d0ffbc9409 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -8709,6 +8709,10 @@ bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
 //       3. Other forms of the same operation (intrinsics and other variants)
 bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
   switch (Inst.getOpcode()) {
+  case X86::ADD8rr:
+  case X86::ADD16rr:
+  case X86::ADD32rr:
+  case X86::ADD64rr:
   case X86::AND8rr:
   case X86::AND16rr:
   case X86::AND32rr:

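In the generated code, the effect is the kind of rewrite seen throughout the
test updates below. As a sketch (registers are illustrative, not taken from any
specific test), a serial chain such as

    addq %rbx, %rax
    addq %rcx, %rax
    addq %rdx, %rax

can be rebalanced into

    addq %rdx, %rcx
    addq %rbx, %rax
    addq %rcx, %rax

where the first two adds are independent and can execute in parallel on a
superscalar core.
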
diff  --git a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
index d2cc1b3599cbc..5c78092e9f2c4 100644
--- a/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
+++ b/llvm/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
@@ -37,190 +37,190 @@ define fastcc i64 @foo() nounwind {
 ; CHECK-NEXT:    addq %rbx, %r11
 ; CHECK-NEXT:    leaq (%rsi,%rdx), %rbx
 ; CHECK-NEXT:    addq %rdi, %rbx
+; CHECK-NEXT:    addq %rdi, %r11
 ; CHECK-NEXT:    addq %rbx, %r11
 ; CHECK-NEXT:    addq %rax, %rax
 ; CHECK-NEXT:    addq %r10, %rax
-; CHECK-NEXT:    movq X(%rip), %rbx
-; CHECK-NEXT:    addq %rdi, %r11
 ; CHECK-NEXT:    addq %r11, %r8
 ; CHECK-NEXT:    addq %r10, %rax
 ; CHECK-NEXT:    addq %r11, %rax
-; CHECK-NEXT:    bswapq %rbx
-; CHECK-NEXT:    leaq (%rdi,%rsi), %r11
+; CHECK-NEXT:    movq X(%rip), %r11
+; CHECK-NEXT:    bswapq %r11
+; CHECK-NEXT:    addq %rdx, %r11
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rdx
+; CHECK-NEXT:    addq %r8, %rdx
 ; CHECK-NEXT:    addq %r8, %r11
-; CHECK-NEXT:    addq %rdx, %rbx
-; CHECK-NEXT:    addq %r11, %rbx
+; CHECK-NEXT:    addq %rdx, %r11
 ; CHECK-NEXT:    leaq (%r10,%rcx), %rdx
 ; CHECK-NEXT:    addq %rdx, %rdx
 ; CHECK-NEXT:    addq %rax, %rdx
-; CHECK-NEXT:    movq X(%rip), %r11
-; CHECK-NEXT:    addq %r8, %rbx
-; CHECK-NEXT:    addq %rbx, %r9
+; CHECK-NEXT:    addq %r11, %r9
 ; CHECK-NEXT:    addq %rax, %rdx
-; CHECK-NEXT:    addq %rbx, %rdx
+; CHECK-NEXT:    addq %r11, %rdx
+; CHECK-NEXT:    movq X(%rip), %r11
 ; CHECK-NEXT:    bswapq %r11
-; CHECK-NEXT:    leaq (%r8,%rdi), %rbx
-; CHECK-NEXT:    addq %r9, %rbx
 ; CHECK-NEXT:    addq %rsi, %r11
-; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    leaq (%r8,%rdi), %rsi
+; CHECK-NEXT:    addq %r9, %rsi
+; CHECK-NEXT:    addq %r9, %r11
+; CHECK-NEXT:    addq %rsi, %r11
 ; CHECK-NEXT:    leaq (%rax,%r10), %rsi
 ; CHECK-NEXT:    addq %rsi, %rsi
 ; CHECK-NEXT:    addq %rdx, %rsi
-; CHECK-NEXT:    movq X(%rip), %rbx
-; CHECK-NEXT:    addq %r9, %r11
 ; CHECK-NEXT:    addq %r11, %rcx
 ; CHECK-NEXT:    addq %rdx, %rsi
 ; CHECK-NEXT:    addq %r11, %rsi
-; CHECK-NEXT:    bswapq %rbx
-; CHECK-NEXT:    leaq (%r9,%r8), %r11
+; CHECK-NEXT:    movq X(%rip), %r11
+; CHECK-NEXT:    bswapq %r11
+; CHECK-NEXT:    addq %rdi, %r11
+; CHECK-NEXT:    leaq (%r9,%r8), %rdi
+; CHECK-NEXT:    addq %rcx, %rdi
 ; CHECK-NEXT:    addq %rcx, %r11
-; CHECK-NEXT:    addq %rdi, %rbx
-; CHECK-NEXT:    addq %r11, %rbx
+; CHECK-NEXT:    addq %rdi, %r11
 ; CHECK-NEXT:    leaq (%rdx,%rax), %rdi
 ; CHECK-NEXT:    addq %rdi, %rdi
 ; CHECK-NEXT:    addq %rsi, %rdi
-; CHECK-NEXT:    movq X(%rip), %r11
-; CHECK-NEXT:    addq %rcx, %rbx
-; CHECK-NEXT:    addq %rbx, %r10
+; CHECK-NEXT:    addq %r11, %r10
 ; CHECK-NEXT:    addq %rsi, %rdi
-; CHECK-NEXT:    addq %rbx, %rdi
+; CHECK-NEXT:    addq %r11, %rdi
+; CHECK-NEXT:    movq X(%rip), %r11
 ; CHECK-NEXT:    bswapq %r11
-; CHECK-NEXT:    leaq (%rcx,%r9), %rbx
-; CHECK-NEXT:    addq %r10, %rbx
 ; CHECK-NEXT:    addq %r8, %r11
-; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    leaq (%rcx,%r9), %r8
+; CHECK-NEXT:    addq %r10, %r8
+; CHECK-NEXT:    addq %r10, %r11
+; CHECK-NEXT:    addq %r8, %r11
 ; CHECK-NEXT:    leaq (%rsi,%rdx), %r8
 ; CHECK-NEXT:    addq %r8, %r8
 ; CHECK-NEXT:    addq %rdi, %r8
-; CHECK-NEXT:    movq X(%rip), %rbx
-; CHECK-NEXT:    addq %r10, %r11
 ; CHECK-NEXT:    addq %r11, %rax
 ; CHECK-NEXT:    addq %rdi, %r8
 ; CHECK-NEXT:    addq %r11, %r8
-; CHECK-NEXT:    bswapq %rbx
-; CHECK-NEXT:    leaq (%r10,%rcx), %r11
+; CHECK-NEXT:    movq X(%rip), %r11
+; CHECK-NEXT:    bswapq %r11
+; CHECK-NEXT:    addq %r9, %r11
+; CHECK-NEXT:    leaq (%r10,%rcx), %r9
+; CHECK-NEXT:    addq %rax, %r9
 ; CHECK-NEXT:    addq %rax, %r11
-; CHECK-NEXT:    addq %r9, %rbx
-; CHECK-NEXT:    addq %r11, %rbx
+; CHECK-NEXT:    addq %r9, %r11
 ; CHECK-NEXT:    leaq (%rdi,%rsi), %r9
 ; CHECK-NEXT:    addq %r9, %r9
 ; CHECK-NEXT:    addq %r8, %r9
-; CHECK-NEXT:    movq X(%rip), %r11
-; CHECK-NEXT:    addq %rax, %rbx
-; CHECK-NEXT:    addq %rbx, %rdx
+; CHECK-NEXT:    addq %r11, %rdx
 ; CHECK-NEXT:    addq %r8, %r9
-; CHECK-NEXT:    addq %rbx, %r9
+; CHECK-NEXT:    addq %r11, %r9
+; CHECK-NEXT:    movq X(%rip), %r11
 ; CHECK-NEXT:    bswapq %r11
-; CHECK-NEXT:    leaq (%rax,%r10), %rbx
-; CHECK-NEXT:    addq %rdx, %rbx
 ; CHECK-NEXT:    addq %rcx, %r11
-; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    leaq (%rax,%r10), %rcx
+; CHECK-NEXT:    addq %rdx, %rcx
+; CHECK-NEXT:    addq %rdx, %r11
+; CHECK-NEXT:    addq %rcx, %r11
 ; CHECK-NEXT:    leaq (%r8,%rdi), %rcx
 ; CHECK-NEXT:    addq %rcx, %rcx
 ; CHECK-NEXT:    addq %r9, %rcx
-; CHECK-NEXT:    movq X(%rip), %rbx
-; CHECK-NEXT:    addq %rdx, %r11
 ; CHECK-NEXT:    addq %r11, %rsi
 ; CHECK-NEXT:    addq %r9, %rcx
 ; CHECK-NEXT:    addq %r11, %rcx
-; CHECK-NEXT:    bswapq %rbx
-; CHECK-NEXT:    leaq (%rdx,%rax), %r11
+; CHECK-NEXT:    movq X(%rip), %r11
+; CHECK-NEXT:    bswapq %r11
+; CHECK-NEXT:    addq %r10, %r11
+; CHECK-NEXT:    leaq (%rdx,%rax), %r10
+; CHECK-NEXT:    addq %rsi, %r10
 ; CHECK-NEXT:    addq %rsi, %r11
-; CHECK-NEXT:    addq %r10, %rbx
-; CHECK-NEXT:    addq %r11, %rbx
+; CHECK-NEXT:    addq %r10, %r11
 ; CHECK-NEXT:    leaq (%r9,%r8), %r10
 ; CHECK-NEXT:    addq %r10, %r10
 ; CHECK-NEXT:    addq %rcx, %r10
-; CHECK-NEXT:    movq X(%rip), %r14
-; CHECK-NEXT:    addq %rsi, %rbx
-; CHECK-NEXT:    addq %rbx, %rdi
+; CHECK-NEXT:    addq %r11, %rdi
 ; CHECK-NEXT:    addq %rcx, %r10
-; CHECK-NEXT:    addq %rbx, %r10
-; CHECK-NEXT:    bswapq %r14
-; CHECK-NEXT:    leaq (%rsi,%rdx), %r11
-; CHECK-NEXT:    addq %rdi, %r11
-; CHECK-NEXT:    addq %rax, %r14
-; CHECK-NEXT:    addq %r11, %r14
+; CHECK-NEXT:    addq %r11, %r10
+; CHECK-NEXT:    movq X(%rip), %rbx
+; CHECK-NEXT:    bswapq %rbx
+; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    leaq (%rsi,%rdx), %rax
+; CHECK-NEXT:    addq %rdi, %rax
+; CHECK-NEXT:    addq %rdi, %rbx
+; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    leaq (%rcx,%r9), %r11
 ; CHECK-NEXT:    addq %r11, %r11
 ; CHECK-NEXT:    addq %r10, %r11
-; CHECK-NEXT:    movq X(%rip), %rax
-; CHECK-NEXT:    addq %rdi, %r14
-; CHECK-NEXT:    addq %r14, %r8
+; CHECK-NEXT:    addq %rbx, %r8
 ; CHECK-NEXT:    addq %r10, %r11
-; CHECK-NEXT:    addq %r14, %r11
+; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    movq X(%rip), %rax
 ; CHECK-NEXT:    bswapq %rax
-; CHECK-NEXT:    leaq (%rdi,%rsi), %rbx
-; CHECK-NEXT:    addq %r8, %rbx
 ; CHECK-NEXT:    addq %rdx, %rax
-; CHECK-NEXT:    addq %rbx, %rax
+; CHECK-NEXT:    leaq (%rdi,%rsi), %rdx
+; CHECK-NEXT:    addq %r8, %rdx
+; CHECK-NEXT:    addq %r8, %rax
+; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    leaq (%r10,%rcx), %rdx
 ; CHECK-NEXT:    addq %rdx, %rdx
 ; CHECK-NEXT:    addq %r11, %rdx
-; CHECK-NEXT:    movq X(%rip), %rbx
-; CHECK-NEXT:    addq %r8, %rax
 ; CHECK-NEXT:    addq %rax, %r9
 ; CHECK-NEXT:    addq %r11, %rdx
 ; CHECK-NEXT:    addq %rax, %rdx
+; CHECK-NEXT:    movq X(%rip), %rbx
 ; CHECK-NEXT:    bswapq %rbx
+; CHECK-NEXT:    addq %rsi, %rbx
 ; CHECK-NEXT:    leaq (%r8,%rdi), %rax
 ; CHECK-NEXT:    addq %r9, %rax
-; CHECK-NEXT:    addq %rsi, %rbx
+; CHECK-NEXT:    addq %r9, %rbx
 ; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    leaq (%r11,%r10), %rax
 ; CHECK-NEXT:    addq %rax, %rax
 ; CHECK-NEXT:    addq %rdx, %rax
-; CHECK-NEXT:    movq X(%rip), %r14
-; CHECK-NEXT:    addq %r9, %rbx
 ; CHECK-NEXT:    addq %rbx, %rcx
 ; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rbx, %rax
-; CHECK-NEXT:    bswapq %r14
+; CHECK-NEXT:    movq X(%rip), %rbx
+; CHECK-NEXT:    bswapq %rbx
+; CHECK-NEXT:    addq %rdi, %rbx
 ; CHECK-NEXT:    leaq (%r9,%r8), %rsi
 ; CHECK-NEXT:    addq %rcx, %rsi
-; CHECK-NEXT:    addq %rdi, %r14
-; CHECK-NEXT:    addq %rsi, %r14
+; CHECK-NEXT:    addq %rcx, %rbx
+; CHECK-NEXT:    addq %rsi, %rbx
 ; CHECK-NEXT:    leaq (%rdx,%r11), %rsi
 ; CHECK-NEXT:    addq %rsi, %rsi
 ; CHECK-NEXT:    addq %rax, %rsi
-; CHECK-NEXT:    movq X(%rip), %rdi
-; CHECK-NEXT:    addq %rcx, %r14
-; CHECK-NEXT:    addq %r14, %r10
+; CHECK-NEXT:    addq %rbx, %r10
 ; CHECK-NEXT:    addq %rax, %rsi
-; CHECK-NEXT:    addq %r14, %rsi
-; CHECK-NEXT:    bswapq %rdi
-; CHECK-NEXT:    leaq (%rcx,%r9), %rbx
+; CHECK-NEXT:    addq %rbx, %rsi
+; CHECK-NEXT:    movq X(%rip), %rbx
+; CHECK-NEXT:    bswapq %rbx
+; CHECK-NEXT:    addq %r8, %rbx
+; CHECK-NEXT:    leaq (%rcx,%r9), %rdi
+; CHECK-NEXT:    addq %r10, %rdi
 ; CHECK-NEXT:    addq %r10, %rbx
-; CHECK-NEXT:    addq %r8, %rdi
+; CHECK-NEXT:    addq %rdi, %rbx
+; CHECK-NEXT:    leaq (%rax,%rdx), %rdi
+; CHECK-NEXT:    addq %rdi, %rdi
+; CHECK-NEXT:    addq %rsi, %rdi
+; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    addq %rsi, %rdi
 ; CHECK-NEXT:    addq %rbx, %rdi
-; CHECK-NEXT:    leaq (%rax,%rdx), %r8
-; CHECK-NEXT:    addq %r8, %r8
-; CHECK-NEXT:    addq %rsi, %r8
-; CHECK-NEXT:    addq %r10, %rdi
-; CHECK-NEXT:    addq %rdi, %r11
-; CHECK-NEXT:    addq %rsi, %r8
-; CHECK-NEXT:    addq %rdi, %r8
-; CHECK-NEXT:    movq X(%rip), %rdi
-; CHECK-NEXT:    bswapq %rdi
-; CHECK-NEXT:    addq %r9, %rdi
+; CHECK-NEXT:    movq X(%rip), %r8
+; CHECK-NEXT:    bswapq %r8
+; CHECK-NEXT:    addq %r9, %r8
 ; CHECK-NEXT:    leaq (%r10,%rcx), %r9
 ; CHECK-NEXT:    addq %r11, %r9
-; CHECK-NEXT:    addq %r9, %rdi
+; CHECK-NEXT:    addq %r11, %r8
+; CHECK-NEXT:    addq %r9, %r8
 ; CHECK-NEXT:    addq %rax, %rsi
 ; CHECK-NEXT:    addq %rsi, %rsi
-; CHECK-NEXT:    addq %r8, %rsi
-; CHECK-NEXT:    addq %r8, %rsi
-; CHECK-NEXT:    addq %r11, %rdi
-; CHECK-NEXT:    addq %rdi, %rdx
 ; CHECK-NEXT:    addq %rdi, %rsi
+; CHECK-NEXT:    addq %rdi, %rsi
+; CHECK-NEXT:    addq %r8, %rdx
+; CHECK-NEXT:    addq %r8, %rsi
 ; CHECK-NEXT:    movq X(%rip), %rax
 ; CHECK-NEXT:    bswapq %rax
 ; CHECK-NEXT:    addq %r10, %r11
 ; CHECK-NEXT:    movq %rax, X(%rip)
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    addq %rdx, %r11
-; CHECK-NEXT:    addq %r11, %rax
 ; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    addq %r11, %rax
 ; CHECK-NEXT:    addq %rsi, %rax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14

diff  --git a/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll b/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
index 38b7a2cdda64e..261e8f873b2fe 100644
--- a/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
+++ b/llvm/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
@@ -28,8 +28,8 @@ define cc 11 i32 @caller(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) nounwind {
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; CHECK-NEXT:    movl %ebp, %esi
 ; CHECK-NEXT:    calll callee@PLT
-; CHECK-NEXT:    addl %eax, %ebx
 ; CHECK-NEXT:    addl %ebp, %ebx
+; CHECK-NEXT:    addl %eax, %ebx
 ; CHECK-NEXT:    movl %ebx, %esi
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll
index 17eda59660193..30e45a4583624 100644
--- a/llvm/test/CodeGen/X86/add-sub-bool.ll
+++ b/llvm/test/CodeGen/X86/add-sub-bool.ll
@@ -68,11 +68,11 @@ define i32 @test_i32_add_add_idx0(i32 %x, i32 %y, i32 %z) nounwind {
 ;
 ; X64-LABEL: test_i32_add_add_idx0:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    leal (%rdi,%rsi), %eax
 ; X64-NEXT:    andl $1, %edx
-; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    leal (%rdx,%rdi), %eax
+; X64-NEXT:    addl %esi, %eax
 ; X64-NEXT:    retq
   %add = add i32 %y, %x
   %mask = and i32 %z, 1

diff  --git a/llvm/test/CodeGen/X86/alias-static-alloca.ll b/llvm/test/CodeGen/X86/alias-static-alloca.ll
index f8d4ccce89f96..5df2906d512a4 100644
--- a/llvm/test/CodeGen/X86/alias-static-alloca.ll
+++ b/llvm/test/CodeGen/X86/alias-static-alloca.ll
@@ -7,15 +7,15 @@
 define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
 ; CHECK-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    leal (%rdi,%rsi), %eax
-; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    leal (%rsi,%rdx), %eax
 ; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    retq
 entry:
   %a0 = alloca i32

diff  --git a/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
index d5d10565f24b8..82aff699cf297 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll
@@ -117,10 +117,10 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtsd2usi %xmm0, %rax
-; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    vcvtsd2usi %xmm0, %rcx
+; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %rdx
 ; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %rax
+; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 
@@ -136,10 +136,10 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtsd2si %xmm0, %rax
-; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    vcvtsd2si %xmm0, %rcx
+; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %rdx
 ; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %rax
+; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 
@@ -155,10 +155,10 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtss2usi %xmm0, %rax
-; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    vcvtss2usi %xmm0, %rcx
+; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %rdx
 ; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %rax
+; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 
@@ -174,10 +174,10 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
 define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtss2si %xmm0, %rax
-; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %rcx
-; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    vcvtss2si %xmm0, %rcx
+; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %rdx
 ; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %rax
+; CHECK-NEXT:    addq %rdx, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    retq
 

diff  --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 8e0943a10537b..58d4e744c8e5b 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -934,10 +934,10 @@ declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsd2usi %xmm0, %eax
-; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    vcvtsd2usi %xmm0, %ecx
+; CHECK-NEXT:    vcvtsd2usi {rz-sae}, %xmm0, %edx
 ; CHECK-NEXT:    vcvtsd2usi {rd-sae}, %xmm0, %eax
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 
@@ -953,10 +953,10 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsd2si %xmm0, %eax
-; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    vcvtsd2si %xmm0, %ecx
+; CHECK-NEXT:    vcvtsd2si {rz-sae}, %xmm0, %edx
 ; CHECK-NEXT:    vcvtsd2si {rd-sae}, %xmm0, %eax
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 
@@ -972,10 +972,10 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtss2usi %xmm0, %eax
-; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    vcvtss2usi %xmm0, %ecx
+; CHECK-NEXT:    vcvtss2usi {rz-sae}, %xmm0, %edx
 ; CHECK-NEXT:    vcvtss2usi {rd-sae}, %xmm0, %eax
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 
@@ -991,10 +991,10 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtss2si %xmm0, %eax
-; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    vcvtss2si %xmm0, %ecx
+; CHECK-NEXT:    vcvtss2si {rz-sae}, %xmm0, %edx
 ; CHECK-NEXT:    vcvtss2si {rd-sae}, %xmm0, %eax
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ret{{[l|q]}}
 

diff  --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index ffe5ffb64af14..412c30906cb5c 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -35,32 +35,32 @@ define dso_local x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1,
 ;
 ; WIN64-LABEL: test_argv64i1:
 ; WIN64:       # %bb.0:
+; WIN64-NEXT:    addq %rdx, %rcx
+; WIN64-NEXT:    addq %rdi, %rcx
+; WIN64-NEXT:    addq %rsi, %rcx
+; WIN64-NEXT:    addq %r8, %rcx
+; WIN64-NEXT:    addq %r9, %rcx
+; WIN64-NEXT:    addq %r10, %rcx
+; WIN64-NEXT:    addq %r11, %rcx
+; WIN64-NEXT:    addq %r12, %rcx
+; WIN64-NEXT:    addq %r14, %rcx
+; WIN64-NEXT:    addq %r15, %rcx
 ; WIN64-NEXT:    addq %rcx, %rax
-; WIN64-NEXT:    addq %rdx, %rax
-; WIN64-NEXT:    addq %rdi, %rax
-; WIN64-NEXT:    addq %rsi, %rax
-; WIN64-NEXT:    addq %r8, %rax
-; WIN64-NEXT:    addq %r9, %rax
-; WIN64-NEXT:    addq %r10, %rax
-; WIN64-NEXT:    addq %r11, %rax
-; WIN64-NEXT:    addq %r12, %rax
-; WIN64-NEXT:    addq %r14, %rax
-; WIN64-NEXT:    addq %r15, %rax
 ; WIN64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    retq
 ;
 ; LINUXOSX64-LABEL: test_argv64i1:
 ; LINUXOSX64:       # %bb.0:
+; LINUXOSX64-NEXT:    addq %rdx, %rcx
+; LINUXOSX64-NEXT:    addq %rdi, %rcx
+; LINUXOSX64-NEXT:    addq %rsi, %rcx
+; LINUXOSX64-NEXT:    addq %r8, %rcx
+; LINUXOSX64-NEXT:    addq %r9, %rcx
+; LINUXOSX64-NEXT:    addq %r12, %rcx
+; LINUXOSX64-NEXT:    addq %r13, %rcx
+; LINUXOSX64-NEXT:    addq %r14, %rcx
+; LINUXOSX64-NEXT:    addq %r15, %rcx
 ; LINUXOSX64-NEXT:    addq %rcx, %rax
-; LINUXOSX64-NEXT:    addq %rdx, %rax
-; LINUXOSX64-NEXT:    addq %rdi, %rax
-; LINUXOSX64-NEXT:    addq %rsi, %rax
-; LINUXOSX64-NEXT:    addq %r8, %rax
-; LINUXOSX64-NEXT:    addq %r9, %rax
-; LINUXOSX64-NEXT:    addq %r12, %rax
-; LINUXOSX64-NEXT:    addq %r13, %rax
-; LINUXOSX64-NEXT:    addq %r14, %rax
-; LINUXOSX64-NEXT:    addq %r15, %rax
 ; LINUXOSX64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
 ; LINUXOSX64-NEXT:    addq {{[0-9]+}}(%rsp), %rax
 ; LINUXOSX64-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 0caa8826e75c8..bb762469571d1 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -939,7 +939,7 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %ebp
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    subl $16, %esp
 ; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %esi
 ; X32-NEXT:    movl %edx, %ebx
@@ -950,36 +950,37 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; X32-NEXT:    subl %esi, %ebx
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    subl %ecx, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    imull %ebx, %eax
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %esi, %edx
+; X32-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    imull %ebx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    subl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %eax, %ecx
+; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    imull %ebp, %edi
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    imull %edx, %edi
 ; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    imull %eax, %edx
-; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    imull %ebp, %eax
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    addl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    addl $16, %esp
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    popl %ebp
 ; X32-NEXT:    retl
@@ -1013,18 +1014,18 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; WIN64-NEXT:    # kill: def $r11d killed $r11d killed $r11
 ; WIN64-NEXT:    subl %r12d, %r11d
 ; WIN64-NEXT:    imull %edx, %r11d
-; WIN64-NEXT:    addl %r9d, %r11d
 ; WIN64-NEXT:    leal (%r14,%r15), %edx
-; WIN64-NEXT:    movl %r14d, %r9d
-; WIN64-NEXT:    subl %r15d, %r9d
-; WIN64-NEXT:    imull %esi, %r9d
-; WIN64-NEXT:    addl %r11d, %r9d
+; WIN64-NEXT:    # kill: def $r14d killed $r14d killed $r14
+; WIN64-NEXT:    subl %r15d, %r14d
+; WIN64-NEXT:    imull %esi, %r14d
+; WIN64-NEXT:    addl %r11d, %r14d
 ; WIN64-NEXT:    addl %ecx, %eax
 ; WIN64-NEXT:    imull %r8d, %eax
 ; WIN64-NEXT:    imull %ebx, %r10d
-; WIN64-NEXT:    addl %r10d, %eax
 ; WIN64-NEXT:    imull %edi, %edx
+; WIN64-NEXT:    addl %r10d, %edx
 ; WIN64-NEXT:    addl %edx, %eax
+; WIN64-NEXT:    addl %r14d, %eax
 ; WIN64-NEXT:    addl %r9d, %eax
 ; WIN64-NEXT:    popq %rbx
 ; WIN64-NEXT:    retq
@@ -1054,19 +1055,19 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
 ; LINUXOSX64-NEXT:    leal (%r13,%r14), %r11d
 ; LINUXOSX64-NEXT:    movl %r13d, %r12d
 ; LINUXOSX64-NEXT:    subl %r14d, %r12d
+; LINUXOSX64-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
 ; LINUXOSX64-NEXT:    imull %edx, %r12d
-; LINUXOSX64-NEXT:    movl {{[0-9]+}}(%rsp), %edx
-; LINUXOSX64-NEXT:    addl %r9d, %r12d
-; LINUXOSX64-NEXT:    movl %r15d, %r9d
-; LINUXOSX64-NEXT:    subl %edx, %r9d
-; LINUXOSX64-NEXT:    imull %esi, %r9d
-; LINUXOSX64-NEXT:    addl %r12d, %r9d
+; LINUXOSX64-NEXT:    movl %r15d, %edx
+; LINUXOSX64-NEXT:    subl %r14d, %edx
+; LINUXOSX64-NEXT:    imull %esi, %edx
+; LINUXOSX64-NEXT:    addl %r12d, %edx
 ; LINUXOSX64-NEXT:    addl %ecx, %eax
 ; LINUXOSX64-NEXT:    imull %r8d, %eax
 ; LINUXOSX64-NEXT:    imull %r10d, %r11d
-; LINUXOSX64-NEXT:    addl %r11d, %eax
-; LINUXOSX64-NEXT:    addl %r15d, %edx
-; LINUXOSX64-NEXT:    imull %edi, %edx
+; LINUXOSX64-NEXT:    addl %r15d, %r14d
+; LINUXOSX64-NEXT:    imull %edi, %r14d
+; LINUXOSX64-NEXT:    addl %r11d, %r14d
+; LINUXOSX64-NEXT:    addl %r14d, %eax
 ; LINUXOSX64-NEXT:    addl %edx, %eax
 ; LINUXOSX64-NEXT:    addl %r9d, %eax
 ; LINUXOSX64-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 4b39dbd297558..b3148629dd554 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1905,19 +1905,19 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
-; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
 ; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    addq %rdx, %rcx # encoding: [0x48,0x01,0xd1]
 ; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
-; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
 ; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -1987,23 +1987,23 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
-; X64-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
+; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
 ; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
 ; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
 ; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
 ; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
 ; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2078,19 +2078,19 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
 ; X64-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
-; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
 ; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    addq %rdx, %rcx # encoding: [0x48,0x01,0xd1]
 ; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
-; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
 ; X64-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -2160,23 +2160,23 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
-; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
-; X64-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
+; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
 ; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
 ; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
 ; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
 ; X64-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06]
 ; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
 ; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
 ; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2206,19 +2206,19 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X86-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
 ; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2228,19 +2228,19 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X64-NEXT:    vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
-; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
 ; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
-; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
@@ -2268,23 +2268,23 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
+; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
 ; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
 ; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
 ; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
 ; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2293,23 +2293,23 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
-; X64-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
+; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
 ; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
@@ -2339,19 +2339,19 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
 ; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X86-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
 ; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X86-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
-; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT:    leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2361,19 +2361,19 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
 ; X64-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
-; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
 ; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    addl %edx, %ecx # encoding: [0x01,0xd1]
 ; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
-; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X64-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
@@ -2401,23 +2401,23 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
 ; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
 ; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
 ; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
-; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
+; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
 ; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
-; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
 ; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
 ; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    addl %eax, %esi # encoding: [0x01,0xc6]
 ; X86-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
 ; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
 ; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X86-NEXT:    popl %esi # encoding: [0x5e]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
@@ -2426,23 +2426,23 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
-; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
-; X64-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
+; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
-; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
 ; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
 ; X64-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
 ; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
 ; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
 ; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)

diff  --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 587d7929e5848..0ee08a43f2657 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -124,10 +124,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __divdi3
@@ -136,10 +136,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, 4(%edx)
 ; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    imull %eax, %ebp
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -192,8 +192,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    movq %rax, (%rbx)
 ; X64-NEXT:    imulq %rax, %r14
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    addq %r14, %rdx
 ; X64-NEXT:    imulq %r15, %rcx
+; X64-NEXT:    addq %r14, %rcx
 ; X64-NEXT:    addq %rdx, %rcx
 ; X64-NEXT:    subq %rax, %r13
 ; X64-NEXT:    sbbq %rcx, %r12

diff  --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 6f67a12f67938..86dd5f32ea6f8 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -124,10 +124,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __udivdi3
@@ -136,10 +136,10 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %ecx, 4(%edx)
 ; X86-NEXT:    movl %eax, (%edx)
-; X86-NEXT:    imull %eax, %ebp
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    subl %eax, %esi
 ; X86-NEXT:    sbbl %ecx, %edi
@@ -192,8 +192,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X64-NEXT:    movq %rax, (%rbx)
 ; X64-NEXT:    imulq %rax, %r14
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    addq %r14, %rdx
 ; X64-NEXT:    imulq %r15, %rcx
+; X64-NEXT:    addq %r14, %rcx
 ; X64-NEXT:    addq %rdx, %rcx
 ; X64-NEXT:    subq %rax, %r13
 ; X64-NEXT:    sbbq %rcx, %r12

diff  --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index 80944d29560cb..cd081051467c1 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -793,9 +793,9 @@ define i64 @udiv_i64_3(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -836,9 +836,9 @@ define i64 @udiv_i64_5(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-858993460, %ecx, %ecx # imm = 0xCCCCCCCC
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-858993459, %edi, %esi # imm = 0xCCCCCCCD
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -880,9 +880,9 @@ define i64 @udiv_i64_15(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    imull $-286331154, %ecx, %ecx # imm = 0xEEEEEEEE
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-286331153, %edi, %ecx # imm = 0xEEEEEEEF
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-286331153, %edi, %esi # imm = 0xEEEEEEEF
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    retl
@@ -924,9 +924,9 @@ define i64 @udiv_i64_17(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-252645136, %ecx, %ecx # imm = 0xF0F0F0F0
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-252645135, %edi, %ecx # imm = 0xF0F0F0F1
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-252645135, %edi, %esi # imm = 0xF0F0F0F1
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -969,9 +969,9 @@ define i64 @udiv_i64_255(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    imull $-16843010, %ecx, %ecx # imm = 0xFEFEFEFE
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-16843009, %esi, %ecx # imm = 0xFEFEFEFF
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-16843009, %esi, %esi # imm = 0xFEFEFEFF
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
 ;
@@ -1012,9 +1012,9 @@ define i64 @udiv_i64_257(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-16711936, %ecx, %ecx # imm = 0xFF00FF00
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-16711935, %edi, %ecx # imm = 0xFF00FF01
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-16711935, %edi, %esi # imm = 0xFF00FF01
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
@@ -1148,9 +1148,9 @@ define i64 @udiv_i64_12(i64 %x) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 2e13776715e5b..c80e8d105b342 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -482,8 +482,8 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -505,8 +505,8 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -532,8 +532,8 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -555,8 +555,8 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -584,8 +584,8 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
 ; X86-64-NEXT:    movabsq $-1229782938247303441, %r8 # imm = 0xEEEEEEEEEEEEEEEF
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -609,8 +609,8 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
 ; WIN64-NEXT:    movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -638,8 +638,8 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -663,8 +663,8 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -694,8 +694,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
 ; X86-64-NEXT:    movabsq $-72340172838076673, %r8 # imm = 0xFEFEFEFEFEFEFEFF
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -721,8 +721,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
 ; WIN64-NEXT:    movabsq $-72340172838076673, %r10 # imm = 0xFEFEFEFEFEFEFEFF
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -750,8 +750,8 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -775,8 +775,8 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -806,8 +806,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
 ; X86-64-NEXT:    movabsq $-281479271743489, %r8 # imm = 0xFFFEFFFEFFFEFFFF
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -833,8 +833,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
 ; WIN64-NEXT:    movabsq $-281479271743489, %r10 # imm = 0xFFFEFFFEFFFEFFFF
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -862,8 +862,8 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -887,8 +887,8 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:
@@ -916,8 +916,8 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
 ; X86-64-NEXT:    imulq %rdi, %rcx
 ; X86-64-NEXT:    movq %rdi, %rax
 ; X86-64-NEXT:    mulq %r8
-; X86-64-NEXT:    addq %rcx, %rdx
 ; X86-64-NEXT:    imulq %rsi, %r8
+; X86-64-NEXT:    addq %rcx, %r8
 ; X86-64-NEXT:    addq %r8, %rdx
 ; X86-64-NEXT:    retq
 ;
@@ -941,8 +941,8 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
 ; WIN64-NEXT:    imulq %rcx, %r9
 ; WIN64-NEXT:    movq %rcx, %rax
 ; WIN64-NEXT:    mulq %r10
-; WIN64-NEXT:    addq %r9, %rdx
 ; WIN64-NEXT:    imulq %r10, %r8
+; WIN64-NEXT:    addq %r9, %r8
 ; WIN64-NEXT:    addq %r8, %rdx
 ; WIN64-NEXT:    retq
 entry:

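The div-rem-pair and divmod128 hunks above all show the same regrouping inside the double-width multiply recomposition. The high half is hi(x*y) = mulhi(x_lo, y_lo) + x_lo*y_hi + x_hi*y_lo (all mod 2^64); previously both cross products were added one at a time onto the mulq result, whereas reassociation now sums the two imulq products first, so there is only one add waiting on the full-width multiply. A standalone sketch (illustrative name; unsigned __int128 assumes a 64-bit GCC/Clang target):

#include <cstdint>

uint64_t mul128_hi64(uint64_t x_lo, uint64_t x_hi,
                     uint64_t y_lo, uint64_t y_hi) {
  unsigned __int128 full = (unsigned __int128)x_lo * y_lo; // mulq: rdx:rax
  uint64_t mul_hi = (uint64_t)(full >> 64);                // the %rdx half
  uint64_t cross = x_lo * y_hi + x_hi * y_lo;              // two imulqs, now summed first
  return mul_hi + cross;                                   // single addq onto %rdx
}

The 32-bit div-rem hunks are the identical pattern with 32-bit halves and __divdi3/__udivdi3 libcalls.
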
diff  --git a/llvm/test/CodeGen/X86/fold-add.ll b/llvm/test/CodeGen/X86/fold-add.ll
index a4dc03471fc30..26396c3fc66a2 100644
--- a/llvm/test/CodeGen/X86/fold-add.ll
+++ b/llvm/test/CodeGen/X86/fold-add.ll
@@ -97,10 +97,10 @@ define dso_local i64 @neg_0x80000001() #0 {
 ; MSTATIC-NEXT:  movabsq $-2147483649, %rcx
 ; MSTATIC-NEXT:  movabsq $foo, %rax
 ; MSTATIC-NEXT:  addq %rcx, %rax
-; MPIC-NEXT:     leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax
-; MPIC-NEXT:     movabsq $foo@GOTOFF, %rcx
-; MPIC-NEXT:     addq %rax, %rcx
+; MPIC-NEXT:     leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx
+; MPIC-NEXT:     movabsq $foo@GOTOFF, %rdx
 ; MPIC-NEXT:     movabsq $-2147483649, %rax
+; MPIC-NEXT:     addq %rdx, %rax
 ; MPIC-NEXT:     addq %rcx, %rax
 entry:
   ret i64 add (i64 ptrtoint (ptr @foo to i64), i64 -2147483649)

diff  --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index a8636a3496dc4..a08133bb7af48 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -24,87 +24,85 @@ define i64 @fn1() #0 {
 ; CHECK-NEXT:    .cfi_offset %esi, -20
 ; CHECK-NEXT:    .cfi_offset %edi, -16
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
-; CHECK-NEXT:    movl $-1028477379, %ebx # imm = 0xC2B2AE3D
-; CHECK-NEXT:    movl $668265295, %ecx # imm = 0x27D4EB4F
-; CHECK-NEXT:    movl a, %edi
-; CHECK-NEXT:    cmpl $0, (%edi)
+; CHECK-NEXT:    movl $-1028477379, %edi # imm = 0xC2B2AE3D
+; CHECK-NEXT:    movl $668265295, %ebx # imm = 0x27D4EB4F
+; CHECK-NEXT:    movl a, %eax
+; CHECK-NEXT:    cmpl $0, (%eax)
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    movl 8(%edi), %esi
-; CHECK-NEXT:    movl 12(%edi), %eax
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    shldl $1, %esi, %edx
-; CHECK-NEXT:    orl %eax, %edx
-; CHECK-NEXT:    leal (%esi,%esi), %eax
-; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 16(%edi), %ebx
-; CHECK-NEXT:    movl 20(%edi), %esi
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    shldl $2, %ebx, %eax
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl %ebx, %eax
-; CHECK-NEXT:    movl %esi, %ebx
-; CHECK-NEXT:    shldl $31, %eax, %ebx
-; CHECK-NEXT:    shll $2, %eax
-; CHECK-NEXT:    orl %ebx, %eax
+; CHECK-NEXT:    movl 8(%eax), %edi
+; CHECK-NEXT:    movl 12(%eax), %esi
+; CHECK-NEXT:    movl %esi, %edx
+; CHECK-NEXT:    shldl $1, %edi, %edx
+; CHECK-NEXT:    orl %esi, %edx
+; CHECK-NEXT:    leal (%edi,%edi), %ecx
+; CHECK-NEXT:    orl %edi, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 16(%eax), %ecx
+; CHECK-NEXT:    movl 20(%eax), %esi
+; CHECK-NEXT:    movl %esi, %edi
+; CHECK-NEXT:    shldl $2, %ecx, %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %esi, %edi
+; CHECK-NEXT:    shldl $31, %ecx, %edi
+; CHECK-NEXT:    shll $2, %ecx
+; CHECK-NEXT:    orl %edi, %ecx
 ; CHECK-NEXT:    shrl %esi
 ; CHECK-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    adcl %edx, %esi
-; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 24(%edi), %eax
+; CHECK-NEXT:    movl 28(%eax), %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl 24(%eax), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl $-1028477379, %ebx # imm = 0xC2B2AE3D
+; CHECK-NEXT:    movl $-1028477379, %ecx # imm = 0xC2B2AE3D
+; CHECK-NEXT:    imull %eax, %ecx
+; CHECK-NEXT:    mull %ebx
+; CHECK-NEXT:    movl %eax, %edi
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; CHECK-NEXT:    imull %eax, %ebx
-; CHECK-NEXT:    mull %ecx
-; CHECK-NEXT:    movl %eax, %esi
-; CHECK-NEXT:    addl %ebx, %edx
-; CHECK-NEXT:    movl 28(%edi), %edi
-; CHECK-NEXT:    imull %edi, %ecx
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %ecx, %ebx
+; CHECK-NEXT:    addl %edx, %ebx
+; CHECK-NEXT:    imull $1336530590, %eax, %ecx # imm = 0x4FA9D69E
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    imull $-2056954758, %edx, %eax # imm = 0x85655C7A
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    movl $1336530590, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; CHECK-NEXT:    movl %ebx, %eax
 ; CHECK-NEXT:    mull %edx
-; CHECK-NEXT:    imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A
-; CHECK-NEXT:    addl %edx, %ebx
-; CHECK-NEXT:    imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT:    addl %ebx, %edx
-; CHECK-NEXT:    shrdl $3, %ecx, %esi
-; CHECK-NEXT:    sarl $3, %ecx
-; CHECK-NEXT:    orl %edx, %ecx
-; CHECK-NEXT:    orl %eax, %esi
-; CHECK-NEXT:    movl $-66860409, %ebx # imm = 0xFC03CA87
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    mull %ebx
-; CHECK-NEXT:    movl %eax, %edi
-; CHECK-NEXT:    imull $326129324, %esi, %eax # imm = 0x137056AC
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    shrdl $3, %ebx, %edi
+; CHECK-NEXT:    sarl $3, %ebx
+; CHECK-NEXT:    orl %ecx, %ebx
+; CHECK-NEXT:    orl %eax, %edi
+; CHECK-NEXT:    imull $326129324, %edi, %eax # imm = 0x137056AC
+; CHECK-NEXT:    imull $-66860409, %ebx, %ecx # imm = 0xFC03CA87
 ; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; CHECK-NEXT:    movl %edi, b
+; CHECK-NEXT:    movl $-66860409, %ebx # imm = 0xFC03CA87
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    mull %ebx
-; CHECK-NEXT:    imull $326129324, %edi, %esi # imm = 0x137056AC
-; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    xorl %esi, %ecx
+; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; CHECK-NEXT:    movl %ecx, b+4
+; CHECK-NEXT:    imull $326129324, %eax, %edx # imm = 0x137056AC
 ; CHECK-NEXT:    imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    movl %eax, b
+; CHECK-NEXT:    mull %ebx
 ; CHECK-NEXT:    jmp .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: # %if.else
-; CHECK-NEXT:    xorl b+4, %ebx
-; CHECK-NEXT:    xorl b, %ecx
-; CHECK-NEXT:    movl $1419758215, %edx # imm = 0x549FCA87
-; CHECK-NEXT:    movl %ecx, %eax
-; CHECK-NEXT:    mull %edx
-; CHECK-NEXT:    imull $93298681, %ecx, %esi # imm = 0x58F9FF9
-; CHECK-NEXT:    addl %edx, %esi
-; CHECK-NEXT:    imull $1419758215, %ebx, %ecx # imm = 0x549FCA87
-; CHECK-NEXT:  .LBB0_3: # %if.end
+; CHECK-NEXT:    xorl b+4, %edi
+; CHECK-NEXT:    xorl b, %ebx
+; CHECK-NEXT:    movl $1419758215, %ecx # imm = 0x549FCA87
+; CHECK-NEXT:    movl %ebx, %eax
+; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    imull $93298681, %ebx, %esi # imm = 0x58F9FF9
+; CHECK-NEXT:    imull $1419758215, %edi, %ecx # imm = 0x549FCA87
 ; CHECK-NEXT:    addl %esi, %ecx
+; CHECK-NEXT:  .LBB0_3: # %if.end
+; CHECK-NEXT:    addl %edx, %ecx
 ; CHECK-NEXT:    addl $-1028477341, %eax # imm = 0xC2B2AE63
 ; CHECK-NEXT:    adcl $-2048144777, %ecx # imm = 0x85EBCA77
 ; CHECK-NEXT:    movl %eax, b

diff  --git a/llvm/test/CodeGen/X86/h-registers-1.ll b/llvm/test/CodeGen/X86/h-registers-1.ll
index 07d85d260a37a..7ed6ddf267a8e 100644
--- a/llvm/test/CodeGen/X86/h-registers-1.ll
+++ b/llvm/test/CodeGen/X86/h-registers-1.ll
@@ -30,11 +30,11 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
 ; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
 ; CHECK-NEXT:    addq %rdi, %rsi
 ; CHECK-NEXT:    addq %rbp, %rdx
-; CHECK-NEXT:    addq %rsi, %rdx
 ; CHECK-NEXT:    addq %rbx, %rcx
 ; CHECK-NEXT:    addq %r8, %rax
 ; CHECK-NEXT:    addq %rcx, %rax
 ; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    addq %rsi, %rax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    popq %rbp
@@ -63,11 +63,11 @@ define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
 ; GNUX32-NEXT:    movzbl {{[0-9]+}}(%esp), %r8d
 ; GNUX32-NEXT:    addq %rdi, %rsi
 ; GNUX32-NEXT:    addq %rbp, %rdx
-; GNUX32-NEXT:    addq %rsi, %rdx
 ; GNUX32-NEXT:    addq %rbx, %rcx
 ; GNUX32-NEXT:    addq %r8, %rax
 ; GNUX32-NEXT:    addq %rcx, %rax
 ; GNUX32-NEXT:    addq %rdx, %rax
+; GNUX32-NEXT:    addq %rsi, %rax
 ; GNUX32-NEXT:    popq %rbx
 ; GNUX32-NEXT:    .cfi_def_cfa_offset 16
 ; GNUX32-NEXT:    popq %rbp

diff  --git a/llvm/test/CodeGen/X86/hipe-cc.ll b/llvm/test/CodeGen/X86/hipe-cc.ll
index 83393c8c128f7..fd27f97425371 100644
--- a/llvm/test/CodeGen/X86/hipe-cc.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc.ll
@@ -21,7 +21,7 @@ entry:
 
 define cc 11 {i32, i32, i32} @addfour(i32 %hp, i32 %p, i32 %x, i32 %y, i32 %z) nounwind {
 entry:
-  ; CHECK:      addl %edx, %eax
+  ; CHECK:      addl %edx, %ecx
   ; CHECK-NEXT: addl %ecx, %eax
   %0 = add i32 %x, %y
   %1 = add i32 %0, %z

diff  --git a/llvm/test/CodeGen/X86/hipe-cc64.ll b/llvm/test/CodeGen/X86/hipe-cc64.ll
index 29741ae76db46..fec3699650124 100644
--- a/llvm/test/CodeGen/X86/hipe-cc64.ll
+++ b/llvm/test/CodeGen/X86/hipe-cc64.ll
@@ -23,9 +23,9 @@ entry:
 
 define cc 11 {i64, i64, i64} @addfour(i64 %hp, i64 %p, i64 %x, i64 %y, i64 %z, i64 %w) nounwind {
 entry:
-  ; CHECK:      leaq (%rsi,%rdx), %rax
-  ; CHECK-NEXT: addq %rcx, %rax
+  ; CHECK:      leaq (%rdx,%rcx), %rax
   ; CHECK-NEXT: addq %r8, %rax
+  ; CHECK-NEXT: addq %rsi, %rax
   %0 = add i64 %x, %y
   %1 = add i64 %0, %z
   %2 = add i64 %1, %w

diff  --git a/llvm/test/CodeGen/X86/imul.ll b/llvm/test/CodeGen/X86/imul.ll
index 9131688c4efcc..94d786a65ffa8 100644
--- a/llvm/test/CodeGen/X86/imul.ll
+++ b/llvm/test/CodeGen/X86/imul.ll
@@ -450,13 +450,18 @@ define i64 @test6(i64 %a) {
 ;
 ; X86-LABEL: test6:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shll $5, %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shll $5, %esi
 ; X86-NEXT:    movl $33, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 entry:
 	%tmp3 = mul i64 %a, 33

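The imul.ll change is the same reassociation applied to a multiply-by-constant expansion, at the cost of an extra register (hence the new push/pop of %esi): x * 33 on 32-bit x86 is lo*33 via mull, plus x_hi*33 folded into the high half, with x_hi*33 decomposed as (x_hi << 5) + x_hi. The shifted term is now added into %edx separately instead of being pre-combined with x_hi. A standalone sketch (illustrative name, not code from the test):

#include <cstdint>

uint64_t mul33(uint64_t x) {
  uint32_t lo = (uint32_t)x;
  uint32_t hi = (uint32_t)(x >> 32);
  uint64_t full = (uint64_t)lo * 33;          // mull $33 -> edx:eax
  uint32_t hi_half = (uint32_t)(full >> 32)   // the %edx half
                   + (hi << 5)                // shll $5 copy, added first
                   + hi;                      // remaining x_hi term
  return ((uint64_t)hi_half << 32) | (uint32_t)full;
}

The mul-constant-i64 hunks further down (mul by 17, 22, 26, 28, 29) follow the same pattern.
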
diff  --git a/llvm/test/CodeGen/X86/lea-opt-cse4.ll b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
index 8fe4516fe959e..d63fdcbca79ed 100644
--- a/llvm/test/CodeGen/X86/lea-opt-cse4.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
@@ -71,13 +71,13 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
 ; X64-NEXT:  # %bb.2: # %exit
 ; X64-NEXT:    addl %eax, %ecx
 ; X64-NEXT:    leal 1(%rax,%rcx), %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    addl %eax, %ecx
-; X64-NEXT:    movl %ecx, 16(%rdi)
+; X64-NEXT:    leal (%rax,%rax), %edx
+; X64-NEXT:    addl %eax, %edx
+; X64-NEXT:    addl %eax, %edx
+; X64-NEXT:    addl %eax, %edx
+; X64-NEXT:    addl %eax, %edx
+; X64-NEXT:    addl %ecx, %edx
+; X64-NEXT:    movl %edx, 16(%rdi)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: foo_loop:
@@ -102,13 +102,13 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
 ; X86-NEXT:  # %bb.2: # %exit
 ; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    leal 1(%ecx,%esi), %edx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edx, 16(%eax)
+; X86-NEXT:    leal (%ecx,%ecx), %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl %esi, 16(%eax)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/lea-opt2.ll b/llvm/test/CodeGen/X86/lea-opt2.ll
index cec19dcf49c8d..b076188c4e918 100644
--- a/llvm/test/CodeGen/X86/lea-opt2.ll
+++ b/llvm/test/CodeGen/X86/lea-opt2.ll
@@ -35,13 +35,11 @@ entry:
 define i32 @test2(ptr %p, i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %esi, %ecx
 ; CHECK-NEXT:    movl %ecx, (%rdi)
 ; CHECK-NEXT:    subl %edx, %eax
-; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = add i32 %a, %b
@@ -55,13 +53,11 @@ entry:
 define i32 @test3(ptr %p, i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %esi, %ecx
 ; CHECK-NEXT:    movl %ecx, (%rdi)
 ; CHECK-NEXT:    subl %edx, %eax
-; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
 ; CHECK-NEXT:    retq
 entry:
   %0 = add i32 %a, %b
@@ -114,8 +110,8 @@ define i64 @test6(ptr %p, i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    addq %rdx, %rcx
 ; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    addq %rdx, %rcx
 ; CHECK-NEXT:    movq %rcx, (%rdi)
 ; CHECK-NEXT:    subq %rdx, %rax
 ; CHECK-NEXT:    retq
@@ -132,8 +128,8 @@ define i64 @test7(ptr %p, i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: test7:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    addq %rdx, %rcx
 ; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    addq %rdx, %rcx
 ; CHECK-NEXT:    movq %rcx, (%rdi)
 ; CHECK-NEXT:    subq %rdx, %rax
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/lrshrink.ll b/llvm/test/CodeGen/X86/lrshrink.ll
index 51f675d245190..b72a5a413c573 100644
--- a/llvm/test/CodeGen/X86/lrshrink.ll
+++ b/llvm/test/CodeGen/X86/lrshrink.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell | FileCheck %s
 
 ; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call
 ; to minimize live-range.
@@ -33,10 +33,10 @@ define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) {
 ; CHECK-NEXT:    addq %r14, %r15
 ; CHECK-NEXT:    callq _Z3foov@PLT
 ; CHECK-NEXT:    movl %eax, %r14d
-; CHECK-NEXT:    addq %r15, %r14
 ; CHECK-NEXT:    callq _Z3foov@PLT
 ; CHECK-NEXT:    movl %eax, %eax
 ; CHECK-NEXT:    addq %r14, %rax
+; CHECK-NEXT:    addq %r15, %rax
 ; CHECK-NEXT:    addq %rbx, %rax
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24

diff  --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index 601166d67f6f2..11c216e67bc82 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -304,40 +304,41 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    setl %dl
-; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    cmpl %ebp, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    movzbl %cl, %edx
 ; X86-NEXT:    jl .LBB5_1
 ; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    jmp .LBB5_3
 ; X86-NEXT:  .LBB5_1:
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:  .LBB5_3:
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -376,42 +377,43 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    setb %dl
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    testb %dl, %dl
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    jne .LBB6_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    jmp .LBB6_3
 ; X86-NEXT:  .LBB6_1:
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:  .LBB6_3:
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl $1, %ebx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %ebp, %edi
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
-; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    imull %eax, %edx
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -456,39 +458,39 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl (%edx), %ecx
+; X86-NEXT:    movl 4(%edx), %esi
+; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    setl %dl
-; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    movzbl %dl, %edx
 ; X86-NEXT:    jl .LBB7_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    jmp .LBB7_3
 ; X86-NEXT:  .LBB7_1:
 ; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB7_3:
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    subl %ebx, %eax
 ; X86-NEXT:    sbbl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -531,41 +533,42 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %eax
-; X86-NEXT:    movl 4(%edx), %edi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %ebp, %edx
-; X86-NEXT:    setl %dl
-; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %eax
+; X86-NEXT:    movl 4(%ecx), %edi
+; X86-NEXT:    cmpl %ebp, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    movzbl %cl, %edx
 ; X86-NEXT:    jl .LBB8_1
 ; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    jmp .LBB8_3
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -610,40 +613,40 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %esi
-; X86-NEXT:    movl 4(%eax), %ecx
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %esi
 ; X86-NEXT:    movl (%edx), %eax
 ; X86-NEXT:    movl 4(%edx), %edi
-; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    setl %dl
-; X86-NEXT:    movzbl %dl, %ebx
+; X86-NEXT:    movzbl %dl, %edx
 ; X86-NEXT:    jl .LBB9_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    jmp .LBB9_3
 ; X86-NEXT:  .LBB9_1:
 ; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:  .LBB9_3:
-; X86-NEXT:    negl %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    negl %edx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    orl $1, %ebp
-; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    subl %ebx, %eax
 ; X86-NEXT:    sbbl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    shrdl $1, %edi, %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    shrl %edi
+; X86-NEXT:    imull %eax, %edx
 ; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/misched-balance.ll b/llvm/test/CodeGen/X86/misched-balance.ll
index aef363f11bbe2..02d1425360159 100644
--- a/llvm/test/CodeGen/X86/misched-balance.ll
+++ b/llvm/test/CodeGen/X86/misched-balance.ll
@@ -9,8 +9,8 @@ define void @unrolled_mmult1(ptr %tmp55, ptr %tmp56, ptr %pre, ptr %pre94,
 entry:
   br label %for.body
 
-; imull folded loads should be in order and interleaved with addl, never
-; adjacent. Also check that we have no spilling.
+; imull folded loads should be in order; the addls may be reordered to reduce
+; total latency. Also check that we have no spilling.
 ;
 ; Since mmult1 IR is already in good order, this effectively ensures
 ; the scheduler maintains source order.
@@ -22,28 +22,28 @@ entry:
 ; CHECK: addl
 ; CHECK: imull 8
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: addl
 ; CHECK: imull 12
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
+; CHECK: addl
 ; CHECK: imull 16
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: addl
 ; CHECK: imull 20
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
 ; CHECK: imull 24
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
+; CHECK: addl
 ; CHECK: imull 28
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: addl
 ; CHECK: imull 32
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
 ; CHECK: imull 36
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
+; CHECK: addl
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK-LABEL: %end
 for.body:
@@ -127,28 +127,28 @@ end:
 ; CHECK: addl
 ; CHECK: imull 8
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: addl
 ; CHECK: imull 12
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
+; CHECK: addl
 ; CHECK: imull 16
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: addl
 ; CHECK: imull 20
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
 ; CHECK: imull 24
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
+; CHECK: addl
 ; CHECK: imull 28
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: addl
 ; CHECK: imull 32
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
 ; CHECK: imull 36
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
+; CHECK: addl
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK-LABEL: %end
 define void @unrolled_mmult2(ptr %tmp55, ptr %tmp56, ptr %pre, ptr %pre94,

diff  --git a/llvm/test/CodeGen/X86/misched-matrix.ll b/llvm/test/CodeGen/X86/misched-matrix.ll
index fa56d6bf226d1..178bea92867ee 100644
--- a/llvm/test/CodeGen/X86/misched-matrix.ll
+++ b/llvm/test/CodeGen/X86/misched-matrix.ll
@@ -33,25 +33,25 @@
 ; ILPMIN: imull
 ; ILPMIN: addl
 ; ILPMIN: imull
-; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
+; ILPMIN: addl
 ; ILPMIN: movl %{{.*}}, 4(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
 ; ILPMIN: imull
-; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
+; ILPMIN: addl
 ; ILPMIN: movl %{{.*}}, 8(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
 ; ILPMIN: imull
-; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
+; ILPMIN: addl
 ; ILPMIN: movl %{{.*}}, 12(
 ; ILPMIN-LABEL: %for.end
 ;

diff  --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll
index 267ebca88e31e..f04b89f54e3a3 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i16.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll
@@ -558,10 +558,10 @@ define i16 @test_mul_by_28(i16 %x) {
 define i16 @test_mul_by_29(i16 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl

diff  --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index b6791a84ffa45..44c140857f4c2 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -900,10 +900,10 @@ define i32 @test_mul_by_28(i32 %x) {
 define i32 @test_mul_by_29(i32 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
@@ -1179,6 +1179,7 @@ define i32 @test_mul_by_66(i32 %x) {
 ;
 ; X64-SLM-LABEL: test_mul_by_66:
 ; X64-SLM:       # %bb.0:
+; X64-SLM-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-SLM-NEXT:    movl %edi, %eax
 ; X64-SLM-NEXT:    shll $6, %eax
 ; X64-SLM-NEXT:    addl %edi, %eax

diff  --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll
index 94e09011fda6c..c65858586f0db 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i64.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll
@@ -498,13 +498,18 @@ define i64 @test_mul_by_16(i64 %x) {
 define i64 @test_mul_by_17(i64 %x) {
 ; X86-LABEL: test_mul_by_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    shll $4, %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shll $4, %esi
 ; X86-NEXT:    movl $17, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X86-NOOPT-LABEL: test_mul_by_17:
@@ -685,13 +690,18 @@ define i64 @test_mul_by_21(i64 %x) {
 define i64 @test_mul_by_22(i64 %x) {
 ; X86-LABEL: test_mul_by_22:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%eax,%eax,4), %ecx
-; X86-NEXT:    leal (%eax,%ecx,4), %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,4), %eax
+; X86-NEXT:    leal (%ecx,%eax,4), %esi
 ; X86-NEXT:    movl $22, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X86-NOOPT-LABEL: test_mul_by_22:
@@ -844,13 +854,18 @@ define i64 @test_mul_by_25(i64 %x) {
 define i64 @test_mul_by_26(i64 %x) {
 ; X86-LABEL: test_mul_by_26:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%eax,%eax,4), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,4), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %esi
 ; X86-NEXT:    movl $26, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X86-NOOPT-LABEL: test_mul_by_26:
@@ -924,13 +939,18 @@ define i64 @test_mul_by_27(i64 %x) {
 define i64 @test_mul_by_28(i64 %x) {
 ; X86-LABEL: test_mul_by_28:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%eax,%eax,8), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %esi
 ; X86-NEXT:    movl $28, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X86-NOOPT-LABEL: test_mul_by_28:
@@ -971,14 +991,19 @@ define i64 @test_mul_by_28(i64 %x) {
 define i64 @test_mul_by_29(i64 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal (%eax,%eax,8), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %esi
+; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    movl $29, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
@@ -1501,8 +1526,8 @@ define i64 @test_mul_spec(i64 %x) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -1537,8 +1562,8 @@ define i64 @test_mul_spec(i64 %x) nounwind {
 ; X86-NOOPT-NEXT:    movl %esi, %eax
 ; X86-NOOPT-NEXT:    mull %edi
 ; X86-NOOPT-NEXT:    imull %esi, %ebx
-; X86-NOOPT-NEXT:    addl %ebx, %edx
 ; X86-NOOPT-NEXT:    imull %ecx, %edi
+; X86-NOOPT-NEXT:    addl %ebx, %edi
 ; X86-NOOPT-NEXT:    addl %edi, %edx
 ; X86-NOOPT-NEXT:    popl %esi
 ; X86-NOOPT-NEXT:    popl %edi

diff  --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index 015137b3d350c..fc40f50ab4e5a 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -10,112 +10,111 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    subl $400, %esp # imm = 0x190
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 60(%ecx), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 56(%ecx), %edi
+; X32-NEXT:    movl 60(%ecx), %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%eax), %ebp
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl 56(%ecx), %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl (%ebp), %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 4(%eax), %ecx
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl 4(%ebp), %ecx
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, %edi
+; X32-NEXT:    movl %ecx, %ebp
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 48(%edi), %esi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl 48(%esi), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 52(%edi), %eax
+; X32-NEXT:    movl 52(%esi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %ebp
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 8(%eax), %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 12(%eax), %edi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebp, %ebx
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    movl 12(%eax), %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebx, %edi
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -123,166 +122,165 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 40(%esi), %edi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 44(%esi), %ebp
+; X32-NEXT:    movl 40(%esi), %ebp
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl 44(%esi), %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    movl 32(%ebp), %edi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl 32(%edi), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 36(%ebp), %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl 36(%edi), %edi
 ; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %esi, %ebx
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    movl %edi, %esi
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %edi, %eax
@@ -290,7 +288,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -300,67 +298,66 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 16(%eax), %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl 16(%eax), %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 20(%eax), %edx
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    mull %edx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 20(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ebp
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %edi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    setb %cl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -368,77 +365,78 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 24(%eax), %ebx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl %ebp, %edi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %edi, %ebp
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %edi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movzbl %bl, %edi
-; X32-NEXT:    adcl %edi, %edx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -446,72 +444,71 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -519,73 +516,74 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X32-NEXT:    adcl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -595,116 +593,115 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 24(%esi), %ebp
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl 24(%esi), %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 28(%esi), %edi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl 28(%esi), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 16(%edi), %esi
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl 16(%esi), %edi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 20(%edi), %eax
+; X32-NEXT:    movl 20(%esi), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %ebp
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %esi, %ebx
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
@@ -715,89 +712,89 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 8(%esi), %ecx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 8(%esi), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl 12(%esi), %ebp
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 12(%esi), %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT:    movl (%ebp), %edi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl (%esi), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 4(%ebp), %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl 4(%esi), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -805,44 +802,43 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %esi, %ebx
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebp, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
@@ -850,33 +846,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X32-NEXT:    adcl %edi, %eax
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ebx, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -884,130 +880,130 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ebp
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %ebp, %edi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %edi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movzbl %bl, %edi
-; X32-NEXT:    adcl %edi, %edx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
@@ -1022,42 +1018,41 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
@@ -1067,96 +1062,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %edi, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
-; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl %ebx, %ebp
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -1173,7 +1170,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl $0, %ebx
@@ -1187,9 +1184,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
@@ -1204,53 +1201,50 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT:    movl 32(%ebx), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl 32(%ebp), %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 36(%eax), %ecx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, %ebx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 36(%ebp), %ebp
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
@@ -1258,7 +1252,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %ebx, %ecx
@@ -1270,23 +1265,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 40(%eax), %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl 40(%eax), %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 44(%eax), %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl 44(%eax), %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
@@ -1294,16 +1290,17 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    movl %ebx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1320,20 +1317,20 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -1341,61 +1338,60 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1404,76 +1400,75 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ebx, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1483,33 +1478,34 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 48(%eax), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl 48(%eax), %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ecx
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 52(%eax), %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    mull %edx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
@@ -1517,29 +1513,29 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1554,34 +1550,34 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 60(%eax), %ecx
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1593,25 +1589,25 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %esi
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -1625,7 +1621,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %edi, %edx
+; X32-NEXT:    movl %ebx, %edx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %ebp, %ecx
 ; X32-NEXT:    adcl $0, %ecx
@@ -1668,27 +1664,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
@@ -1708,30 +1704,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %ebx, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -1743,30 +1739,30 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    movl %ecx, %edi
-; X32-NEXT:    adcl %eax, %edi
+; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    adcl %eax, %ebx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1779,10 +1775,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ebx
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl %eax, %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1793,7 +1790,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -1806,6 +1803,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1849,55 +1847,54 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ebx, %ebp
+; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -1905,24 +1902,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -1930,7 +1927,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -1950,7 +1947,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebp, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -1960,268 +1957,267 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebx, %ebp
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movl %ebp, %esi
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ebx, %eax
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebp
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ebx, %ebp
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
@@ -2234,62 +2230,63 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %edi, %edx
+; X32-NEXT:    movl %ebx, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %edi, %ecx
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
@@ -2299,82 +2296,83 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %ebx, %ecx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ebx, %ebp
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %esi, %edi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    addl %ebx, %ebp
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    adcl %eax, %ebp
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2386,14 +2384,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl %eax, %ebx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2420,143 +2418,141 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 64(%eax), %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 68(%eax), %ecx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 68(%eax), %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ecx
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %ebp
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    setb %cl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 72(%eax), %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl 72(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 76(%eax), %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ebx
+; X32-NEXT:    movl 76(%eax), %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ebp
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %edi, %ebp
+; X32-NEXT:    adcl %ebp, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -2566,97 +2562,96 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ebx, %ebp
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebp
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movzbl %bl, %eax
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebx, %esi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2664,42 +2659,44 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movzbl %bl, %ecx
-; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl %bl, %ebx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X32-NEXT:    adcl %edi, %eax
-; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ebx, %eax
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2707,64 +2704,63 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 80(%eax), %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 84(%eax), %ecx
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ecx, %ebp
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %edi
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    adcl %ebp, %edi
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2772,38 +2768,39 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 88(%eax), %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 92(%eax), %edi
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    adcl %ebx, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2816,26 +2813,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %edi, %ecx
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %esi, %edi
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %esi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    adcl $0, %edx
@@ -2848,15 +2845,16 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %edi, %edx
+; X32-NEXT:    movl %ebx, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebp, %ecx
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
@@ -2866,130 +2864,130 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %ebx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ebx, %ebp
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %esi, %edi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ebx, %ecx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    adcl %eax, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %eax, %ebx
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2999,14 +2997,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -3041,7 +3039,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
@@ -3088,7 +3086,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
@@ -3137,56 +3135,52 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %esi, %ecx
-; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %edi, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %esi, %ebp
+; X32-NEXT:    addl %edx, %ebp
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %edx, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl %ebx, %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %ebp, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
-; X32-NEXT:    setb %cl
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3196,50 +3190,48 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT:    imull %ebx, %ebp
+; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %ecx
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    imull %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %edi, %ecx
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %ecx, %esi
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb %cl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -3248,79 +3240,79 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 104(%esi), %ebx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl 104(%ecx), %ebp
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 108(%esi), %esi
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
-; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl 108(%ecx), %ecx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebx, %edi
+; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %ebx, %edi
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 96(%esi), %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl 96(%ecx), %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl 100(%esi), %eax
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 100(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3331,32 +3323,31 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    adcl %ebx, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ebx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl %eax, %ebx
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3367,21 +3358,21 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %edi, %ebx
+; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %esi, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl %ebx, %ebp
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -3389,101 +3380,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 112(%ecx), %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %edi, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl 112(%ecx), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    imull %eax, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl 116(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    imull %eax, %ebx
+; X32-NEXT:    addl %edi, %ebx
 ; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl 120(%ecx), %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ecx, %esi
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %esi, %ecx
+; X32-NEXT:    movl 124(%edx), %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl 124(%esi), %esi
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    addl %edx, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    adcl %ebx, %edi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ebp, %ebx
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ecx, %ebp
+; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %bl, %ecx
-; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    movzbl %bl, %esi
+; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %edx
+; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    addl %edx, %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    imull %eax, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %edx
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl %edx, %ebx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %esi, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %eax, %ecx
+; X32-NEXT:    imull %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3492,9 +3480,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movzbl %bl, %edi
 ; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -3516,7 +3505,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -3553,7 +3542,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
@@ -3644,7 +3633,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
@@ -3664,7 +3653,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ebx, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -3689,7 +3678,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
@@ -3719,7 +3708,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %edi, %ebp
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
@@ -3814,7 +3803,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -3959,7 +3948,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3994,7 +3983,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -4017,7 +4006,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl %edi, %ebp
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
@@ -4027,18 +4015,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %esi, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -4060,9 +4048,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -4093,7 +4081,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %ebx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4120,7 +4108,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
@@ -4152,7 +4140,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
@@ -4173,14 +4161,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 104(%eax), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
@@ -4202,7 +4190,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    adcl %ebp, %ecx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %esi
@@ -4219,7 +4207,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4251,56 +4239,53 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    imull %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %eax, %edi
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %edi, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    imull %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %ecx, %esi
+; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %ebx, %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4309,52 +4294,51 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    imull %eax, %ecx
 ; X32-NEXT:    movl 120(%edi), %esi
-; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl 112(%edi), %edi
-; X32-NEXT:    movl 116(%ebx), %ebp
+; X32-NEXT:    movl 112(%edi), %ecx
+; X32-NEXT:    movl 116(%edi), %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    imull %ebp, %ebx
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %ebx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %edi, %ecx
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    imull %edi, %edx
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    imull %ecx, %ebx
+; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %edx, %ebx
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -4364,7 +4348,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -4393,7 +4377,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4427,7 +4411,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
@@ -4461,9 +4445,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -4492,101 +4476,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl %ecx, %ebx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    imull %eax, %esi
+; X32-NEXT:    imull %eax, %edi
+; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    imull %ebp, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %edi
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %edi, %edx
+; X32-NEXT:    imull %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    imull %esi, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %ecx, %edi
 ; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %ebp, %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movzbl %cl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %esi, %ecx
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    imull %eax, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %ebx
 ; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %edi, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    imull %ebp, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, %ebx
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ebp, %edi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %ebp, %ecx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
@@ -4602,7 +4583,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -4624,9 +4605,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -4637,9 +4618,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -4677,9 +4658,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4717,7 +4698,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -4804,201 +4785,200 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    subq $240, %rsp
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rsi, %rbp
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 40(%rdi), %r12
-; X64-NEXT:    movq 32(%rdi), %r14
-; X64-NEXT:    movq 56(%rdi), %r15
+; X64-NEXT:    movq 40(%rdi), %rbx
+; X64-NEXT:    movq 32(%rdi), %r12
+; X64-NEXT:    movq 56(%rdi), %r14
 ; X64-NEXT:    movq 48(%rdi), %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq (%rsi), %r11
-; X64-NEXT:    movq 8(%rsi), %r8
+; X64-NEXT:    movq 8(%rsi), %rcx
+; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    addq %rsi, %r9
+; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %r9, %rcx
-; X64-NEXT:    adcq %rsi, %r10
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r9, %r8
+; X64-NEXT:    adcq %rdi, %r10
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %r9d
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r10, %rsi
-; X64-NEXT:    adcq %r9, %r13
 ; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %r10, %rdi
+; X64-NEXT:    adcq %r9, %r15
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %rbx, %r14
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %r9, %r11
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    addq %r11, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r10, %r9
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r9, %rbx
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r11
-; X64-NEXT:    addq %rdi, %rbx
-; X64-NEXT:    adcq %rcx, %r11
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 16(%rbp), %r15
-; X64-NEXT:    movq %r14, %r10
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    addq %rbp, %rbx
+; X64-NEXT:    adcq %r8, %r11
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 16(%r13), %rcx
 ; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r14, %rbp
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rdi, %r14
+; X64-NEXT:    addq %r8, %r14
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq 24(%rbp), %rdi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %rbp
+; X64-NEXT:    movq 24(%r13), %r13
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    adcq %r9, %r12
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %r12, %r9
 ; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    addq %rbx, %r8
-; X64-NEXT:    movq %r8, (%rsp) # 8-byte Spill
+; X64-NEXT:    adcq %rax, %r8
+; X64-NEXT:    addq %rbx, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r11, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, (%rsp) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    addq %rsi, %r9
-; X64-NEXT:    adcq %r13, %rdi
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    addq %rdi, %r9
+; X64-NEXT:    adcq %r15, %r8
 ; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rcx, %r11
-; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    addq %rsi, %r11
+; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    addq %r11, %rax
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    adcq %rsi, %rcx
-; X64-NEXT:    setb %sil
+; X64-NEXT:    adcq %rdi, %rsi
+; X64-NEXT:    setb %dil
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movzbl %sil, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    movzbl %dil, %esi
+; X64-NEXT:    adcq %rsi, %rdx
 ; X64-NEXT:    addq %r9, %r14
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rdi, %r11
+; X64-NEXT:    adcq %r8, %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r10b, %ecx
-; X64-NEXT:    adcq %rcx, %rax
+; X64-NEXT:    movzbl %r10b, %esi
+; X64-NEXT:    adcq %rsi, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq 16(%r8), %rsi
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq 16(%rcx), %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 24(%r8), %r14
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq 24(%rcx), %r14
 ; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %r13, %r15
+; X64-NEXT:    movq %r13, %rbx
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rcx, %r11
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rsi, %r8
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r11, %rsi
-; X64-NEXT:    adcq %rdi, %rbx
+; X64-NEXT:    addq %r8, %rsi
+; X64-NEXT:    adcq %rdi, %r11
 ; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %r13, %r12
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rbx, %rdi
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r11, %r8
 ; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq (%r8), %r13
+; X64-NEXT:    adcq %rax, %rbp
+; X64-NEXT:    movq (%rcx), %r13
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq 8(%r8), %rbp
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq 8(%rcx), %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %r11, %r14
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq %r12, %r11
+; X64-NEXT:    movq %r12, %rcx
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %r12
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r12, %rbx
@@ -5006,31 +4986,32 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq %rax, %r11
 ; X64-NEXT:    addq %r9, %rbx
 ; X64-NEXT:    adcq %rsi, %r11
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r13, %r10
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    movq %r13, %rcx
 ; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rsi, %r14
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    adcq %r9, %r12
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %r12, %rsi
@@ -5042,202 +5023,199 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    addq %rdi, %rsi
-; X64-NEXT:    adcq %rcx, %r9
+; X64-NEXT:    addq %r8, %rsi
+; X64-NEXT:    adcq %rbp, %r9
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r8
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rdi, %r11
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rbx, %rcx
-; X64-NEXT:    adcq %r11, %r14
+; X64-NEXT:    addq %r11, %rcx
+; X64-NEXT:    adcq %r8, %rbx
 ; X64-NEXT:    setb %r11b
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movzbl %r11b, %edi
 ; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    addq %rsi, %r13
+; X64-NEXT:    addq %rsi, %r14
 ; X64-NEXT:    adcq %r9, %rcx
 ; X64-NEXT:    movzbl %r10b, %esi
 ; X64-NEXT:    adcq %rsi, %rax
 ; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq (%rsp), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    adcq (%rsp), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 32(%rcx), %rdi
-; X64-NEXT:    movq %r8, %r10
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq 32(%r10), %rcx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r12, %r8
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %r14
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rsi, %r11
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq 40(%rcx), %rsi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq 40(%r10), %rcx
+; X64-NEXT:    movq %r10, %rdi
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %r12
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %r11, %rsi
 ; X64-NEXT:    adcq %r9, %rbx
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rbx, %r11
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %rbx, %r13
 ; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %r10
-; X64-NEXT:    setb %r15b
-; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    movq %rbp, %rdi
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    adcq %r14, %r15
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r10, %r14
-; X64-NEXT:    movzbl %r15b, %eax
+; X64-NEXT:    addq %r15, %r14
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %rbx
-; X64-NEXT:    addq %r8, %r14
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    adcq %rsi, %rbx
 ; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %rcx, %r8
-; X64-NEXT:    movq 48(%rcx), %rcx
-; X64-NEXT:    movq %r12, %r15
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq 48(%rdi), %r12
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r12
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %rsi, %r13
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq 56(%r8), %rsi
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq 56(%rdi), %rsi
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %r13, %rdi
-; X64-NEXT:    adcq %r10, %r15
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    adcq %r15, %rcx
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rsi, %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r15, %r13
-; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    addq %rcx, %r13
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %rsi
 ; X64-NEXT:    addq %r14, %rbp
-; X64-NEXT:    movq %rbp, %r8
+; X64-NEXT:    movq %rbp, %r10
 ; X64-NEXT:    adcq %rbx, %rdi
 ; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    addq %r11, %r13
 ; X64-NEXT:    adcq %r9, %rsi
-; X64-NEXT:    setb %bpl
+; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r9, %rbx
-; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rbx, %r9
-; X64-NEXT:    adcq %r10, %r15
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %rbx, %r12
+; X64-NEXT:    adcq %r9, %rcx
+; X64-NEXT:    setb %r15b
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r15, %rbx
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    movzbl %r15b, %eax
 ; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    addq %r13, %r12
-; X64-NEXT:    adcq %rsi, %r9
-; X64-NEXT:    movzbl %bpl, %eax
+; X64-NEXT:    addq %r13, %rbp
+; X64-NEXT:    adcq %rsi, %r12
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X64-NEXT:    adcq %rax, %rbx
 ; X64-NEXT:    adcq $0, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    adcq %rax, (%rsp) # 8-byte Folded Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
 ; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
@@ -5246,127 +5224,127 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rcx, %r10
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rcx, %rdi
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %r10, %r12
-; X64-NEXT:    adcq %rsi, %r15
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    addq %rdi, %r12
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r15, %rsi
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    adcq %rax, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r10, %r13
+; X64-NEXT:    addq %rcx, %r13
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, %r11
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r10, %r11
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %r10
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    adcq %r15, %rcx
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    movq %rbp, %r15
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r10, %rbp
-; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    addq %rcx, %rbp
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r13
-; X64-NEXT:    addq %rdi, %rbp
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    adcq %r12, %r13
-; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r11, %r8
+; X64-NEXT:    movq %r11, %r10
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    movq %r15, %r12
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r15, %rax
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    adcq %r10, %r8
+; X64-NEXT:    adcq %r8, %rcx
 ; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %r8, %r12
+; X64-NEXT:    addq %rcx, %r12
 ; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rdi
+; X64-NEXT:    adcq %rax, %r8
 ; X64-NEXT:    addq %rbp, %r11
 ; X64-NEXT:    adcq %r13, %r15
 ; X64-NEXT:    movq %r15, %rbp
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    addq %rsi, %r12
-; X64-NEXT:    adcq %r9, %rdi
-; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %rcx, %rsi
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    addq %rdi, %r12
+; X64-NEXT:    adcq %r9, %r8
+; X64-NEXT:    setb %r9b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
 ; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rcx, %rdi
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %r8, %rax
-; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    addq %r12, %r10
-; X64-NEXT:    adcq %rdi, %r8
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X64-NEXT:    adcq %r8, %rdi
+; X64-NEXT:    movzbl %r9b, %ecx
 ; X64-NEXT:    adcq %rcx, %rax
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    adcq $0, %rdx
@@ -5381,350 +5359,342 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X64-NEXT:    adcq %rax, %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq 64(%r9), %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq 64(%r10), %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rcx, %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rsi, %r8
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq 72(%r9), %rsi
-; X64-NEXT:    movq %r9, %rcx
-; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rcx, %r9
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq 72(%r10), %rcx
+; X64-NEXT:    movq %r10, %rsi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r8, %rbx
-; X64-NEXT:    adcq %rdi, %r10
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    addq %r9, %rbx
+; X64-NEXT:    adcq %r8, %rcx
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r13
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdi, %r13
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r10, %r9
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    addq %rcx, %r9
+; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    adcq %rax, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    addq %r14, %rax
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    addq %r15, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r10, %r8
+; X64-NEXT:    adcq %r14, %rcx
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, %r12
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r8, %rbp
+; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r14
 ; X64-NEXT:    addq %r11, %rbp
 ; X64-NEXT:    adcq %rbx, %r14
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rcx, %rbx
-; X64-NEXT:    movq 80(%rcx), %r15
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    movq 80(%rsi), %r15
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r8, %r11
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq 88(%rbx), %rbx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    movq 88(%r10), %r10
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    adcq %r11, %rcx
+; X64-NEXT:    setb %r11b
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    addq %rcx, %r13
+; X64-NEXT:    movzbl %r11b, %eax
 ; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbp, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rbp, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r14, %rbx
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    adcq $0, %r12
 ; X64-NEXT:    addq %r9, %r13
-; X64-NEXT:    adcq %rsi, %r12
-; X64-NEXT:    setb %bpl
+; X64-NEXT:    adcq %rdi, %r12
+; X64-NEXT:    setb %r9b
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rdi, %r10
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, %r14
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    addq %r10, %rax
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    adcq %r8, %rdi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    adcq %r8, %rcx
 ; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r10
+; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %r8b, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r13, %rsi
+; X64-NEXT:    addq %r13, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r12, %rsi
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r12, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %bpl, %ecx
+; X64-NEXT:    movzbl %r9b, %ecx
 ; X64-NEXT:    adcq %rcx, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %rbx
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    imulq %rax, %r10
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rbx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    imulq %rcx, %r15
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    imulq %rsi, %r15
+; X64-NEXT:    addq %r10, %r15
 ; X64-NEXT:    addq %rdx, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    imulq %rsi, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r10, %rdx
+; X64-NEXT:    movq %rax, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    imulq %r9, %rbx
-; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    addq %r8, %rdi
-; X64-NEXT:    adcq %r15, %rbx
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    imulq %rbx, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    imulq %rdi, %r10
+; X64-NEXT:    addq %rdx, %r10
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rdx, %r10
+; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    adcq %r15, %r10
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    addq %r15, %r13
+; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r8, %r15
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r15, %r11
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    addq %r13, %r15
+; X64-NEXT:    adcq %r12, %rbp
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r8, %r15
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %rbp, %r12
+; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    adcq %rbx, %rsi
+; X64-NEXT:    addq %r8, %r12
+; X64-NEXT:    adcq %r10, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq 120(%rdx), %rdi
+; X64-NEXT:    movq 120(%rdx), %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %rax, %rdi
-; X64-NEXT:    movq 112(%rdx), %rbx
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    imulq %rax, %rcx
+; X64-NEXT:    movq 112(%rdx), %r9
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    imulq %rcx, %rbx
-; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    movq 96(%r12), %r10
-; X64-NEXT:    movq 104(%r12), %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT:    imulq %rbx, %r9
+; X64-NEXT:    addq %rcx, %r9
+; X64-NEXT:    addq %rdx, %r9
+; X64-NEXT:    movq 96(%rdi), %rbp
+; X64-NEXT:    movq 104(%rdi), %rdi
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, %r12
-; X64-NEXT:    imulq %rdi, %r12
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    imulq %rdi, %rax
+; X64-NEXT:    imulq %rbp, %r11
+; X64-NEXT:    addq %rax, %r11
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r12, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %r10, %r14
-; X64-NEXT:    addq %rdx, %r14
+; X64-NEXT:    addq %rdx, %r11
 ; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    adcq %rbx, %r14
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    adcq %r9, %r11
+; X64-NEXT:    movq %r11, %r14
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %r8, %r9
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %r8, %r12
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r12, %rbx
-; X64-NEXT:    adcq %rbp, %r10
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r9, %r8
+; X64-NEXT:    adcq %rcx, %rbp
+; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    addq %r10, %rax
-; X64-NEXT:    movzbl %r8b, %edi
-; X64-NEXT:    adcq %rdi, %rdx
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    addq %rbp, %rax
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    adcq %rcx, %rdx
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    adcq %r14, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r11, %rbx
-; X64-NEXT:    adcq %r15, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r15, %r8
+; X64-NEXT:    adcq %r12, %rax
 ; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    movq 80(%r13), %r8
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq 80(%r8), %r9
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 88(%r13), %r11
-; X64-NEXT:    movq %r13, %r10
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 88(%r8), %rbx
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r9
+; X64-NEXT:    movq %rsi, %r11
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %rcx, %rdi
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %r8, %r11
-; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %rcx, %rsi
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %r13
-; X64-NEXT:    movq %r10, %rdi
-; X64-NEXT:    movq 64(%r10), %r10
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    adcq %rax, %rbx
+; X64-NEXT:    movq 64(%r8), %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 72(%rdi), %rax
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq 72(%r8), %rax
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %rcx, %r12
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r11, %r9
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %r8
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r12, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r15, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r8, %r11
-; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r13, %r11
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbx, %rbp
+; X64-NEXT:    addq %r10, %rbp
 ; X64-NEXT:    adcq %r14, %r12
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %r10, %rdi
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, %rbx
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rcx, %r14
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r14, %r8
 ; X64-NEXT:    adcq %r10, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r14
@@ -5733,26 +5703,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq %rax, %r10
 ; X64-NEXT:    addq %rbp, %r9
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r12, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r12, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r14
 ; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    adcq %r13, %r10
+; X64-NEXT:    adcq %rbx, %r10
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rcx, %r9
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r9, %rax
@@ -5764,8 +5734,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %sil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r14, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r14, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r10, %r9
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movzbl %dil, %ecx
@@ -5773,160 +5743,163 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq 96(%rdi), %rsi
-; X64-NEXT:    imulq %rsi, %r15
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %r8, %rcx
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r15, %rdx
-; X64-NEXT:    movq 104(%rdi), %r9
-; X64-NEXT:    imulq %r9, %rcx
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    movq %rcx, %r14
-; X64-NEXT:    movq 112(%rdi), %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    imulq %r12, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq 120(%rdi), %rdi
-; X64-NEXT:    imulq %r15, %rdi
-; X64-NEXT:    addq %rdx, %rdi
-; X64-NEXT:    addq %r10, %r8
-; X64-NEXT:    adcq %r14, %rdi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq 96(%r8), %rcx
+; X64-NEXT:    imulq %rcx, %r15
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r13, %r9
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq 104(%r8), %rdi
+; X64-NEXT:    imulq %rdi, %r9
+; X64-NEXT:    addq %r15, %r9
+; X64-NEXT:    addq %rdx, %r9
+; X64-NEXT:    movq 112(%r8), %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    imulq %r11, %rdx
+; X64-NEXT:    movq 120(%r8), %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    imulq %r10, %r8
+; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r10, %r13
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    addq %rsi, %r13
+; X64-NEXT:    adcq %r9, %r8
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r10, %r9
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r13, %rbp
-; X64-NEXT:    adcq %r14, %rcx
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rsi, %r10
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %r10, %r12
+; X64-NEXT:    adcq %rcx, %rsi
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rcx, %r14
-; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    addq %rsi, %r14
+; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rax, %r10
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    adcq %rdi, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    addq %r13, %r14
+; X64-NEXT:    adcq %r8, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    imulq %rax, %rsi
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    imulq %r15, %rdi
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    imulq %r15, %rcx
+; X64-NEXT:    addq %rsi, %rcx
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    imulq %rsi, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    imulq %rdi, %rbx
+; X64-NEXT:    addq %rax, %rbx
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    imulq %r12, %rsi
-; X64-NEXT:    addq %rdx, %rsi
-; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    imulq %r9, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    imulq %r11, %rbx
 ; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    addq %rcx, %r13
+; X64-NEXT:    addq %r9, %rcx
 ; X64-NEXT:    adcq %r8, %rbx
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rbx, %rbp
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %r9
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %r8, %rdi
+; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r8, %rsi
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rsi, %r11
-; X64-NEXT:    adcq %rdi, %r8
+; X64-NEXT:    addq %rdi, %r11
+; X64-NEXT:    adcq %rsi, %r8
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    addq %r8, %rax
 ; X64-NEXT:    movzbl %sil, %esi
 ; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq %r13, %rax
-; X64-NEXT:    adcq %rbx, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq %rbp, %r11
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    adcq %rbp, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r12, %r11
 ; X64-NEXT:    adcq %r14, %rax
 ; X64-NEXT:    adcq %r10, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT:    movq %rdi, %r9
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, (%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 8(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 16(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 24(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 32(%rsi)
-; X64-NEXT:    movq (%rsp), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 40(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 48(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 56(%rsi)
-; X64-NEXT:    movq %r8, 64(%rsi)
-; X64-NEXT:    movq %r9, 72(%rsi)
-; X64-NEXT:    movq %r10, 80(%rsi)
-; X64-NEXT:    movq %rbx, 88(%rsi)
-; X64-NEXT:    movq %rcx, 96(%rsi)
-; X64-NEXT:    movq %r11, 104(%rsi)
-; X64-NEXT:    movq %rax, 112(%rsi)
-; X64-NEXT:    movq %rdx, 120(%rsi)
+; X64-NEXT:    movq %rsi, (%rcx)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, 8(%rcx)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, 16(%rcx)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, 24(%rcx)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, 32(%rcx)
+; X64-NEXT:    movq (%rsp), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, 40(%rcx)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, 48(%rcx)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, 56(%rcx)
+; X64-NEXT:    movq %rdi, 64(%rcx)
+; X64-NEXT:    movq %r8, 72(%rcx)
+; X64-NEXT:    movq %r9, 80(%rcx)
+; X64-NEXT:    movq %r10, 88(%rcx)
+; X64-NEXT:    movq %r13, 96(%rcx)
+; X64-NEXT:    movq %r11, 104(%rcx)
+; X64-NEXT:    movq %rax, 112(%rcx)
+; X64-NEXT:    movq %rdx, 120(%rcx)
 ; X64-NEXT:    addq $240, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12

diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 9382278ff5c2d..434d14ad4abed 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -23,85 +23,85 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X32-NEXT:    .cfi_offset %ebp, -8
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 12(%ecx), %esi
-; X32-NEXT:    movl 8(%ecx), %ebx
+; X32-NEXT:    movl 12(%ecx), %ebp
+; X32-NEXT:    movl 8(%ecx), %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl (%eax), %ebx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl (%eax), %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl %esi, %ecx
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 4(%eax), %ebp
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 4(%eax), %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    movl (%edi), %esi
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl 4(%edi), %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl 4(%edi), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %ecx, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 8(%eax), %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %esi
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -110,55 +110,54 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X32-NEXT:    movl 12(%eax), %esi
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %esi, %ecx
-; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebx, %ecx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    adcl %edi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebx
-; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -168,43 +167,41 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 16(%ecx), %edi
-; X32-NEXT:    movl %ebx, %esi
-; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    imull %edi, %ebx
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    movl 20(%ecx), %eax
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    imull %eax, %ebp
-; X32-NEXT:    addl %edx, %ebp
+; X32-NEXT:    movl 20(%ecx), %ebp
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %ebx, %esi
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl 24(%ecx), %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %ecx
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    imull %esi, %edx
 ; X32-NEXT:    movl 28(%ecx), %ecx
-; X32-NEXT:    imull %esi, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    imull %ebx, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    adcl %ebp, %ecx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    mull %ebx
+; X32-NEXT:    addl %edx, %ecx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ebp, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
@@ -216,84 +213,84 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movzbl %bl, %esi
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 28(%edi), %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %esi
-; X32-NEXT:    movl 24(%edi), %ecx
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl 28(%edi), %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %ecx
+; X32-NEXT:    movl 24(%edi), %esi
+; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl 16(%edi), %ebp
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl %ecx, %esi
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    movl 16(%edi), %ecx
 ; X32-NEXT:    movl 20(%edi), %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    imull %ebx, %edi
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    imull %ebx, %edx
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    addl %edi, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    imull %ecx, %edi
+; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %edx, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %ecx, %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    adcl %esi, %edi
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
-; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %ebx, %esi
-; X32-NEXT:    adcl %ecx, %edi
-; X32-NEXT:    setb %cl
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    addl %esi, %ecx
+; X32-NEXT:    adcl %ebx, %edi
+; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movzbl %cl, %ecx
-; X32-NEXT:    adcl %ecx, %edx
+; X32-NEXT:    movzbl %bl, %esi
+; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, %ebx
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %esi, %ebx
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, (%ecx)
+; X32-NEXT:    movl %edi, (%esi)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, 4(%ecx)
+; X32-NEXT:    movl %edi, 4(%esi)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, 8(%ecx)
+; X32-NEXT:    movl %edi, 8(%esi)
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, 12(%ecx)
-; X32-NEXT:    movl %ebx, 16(%ecx)
-; X32-NEXT:    movl %esi, 20(%ecx)
-; X32-NEXT:    movl %eax, 24(%ecx)
-; X32-NEXT:    movl %edx, 28(%ecx)
+; X32-NEXT:    movl %edi, 12(%esi)
+; X32-NEXT:    movl %ebx, 16(%esi)
+; X32-NEXT:    movl %ecx, 20(%esi)
+; X32-NEXT:    movl %eax, 24(%esi)
+; X32-NEXT:    movl %edx, 28(%esi)
 ; X32-NEXT:    addl $72, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 20
 ; X32-NEXT:    popl %esi
@@ -312,9 +309,12 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %r12
 ; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    .cfi_offset %rbx, -32
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 40
+; X64-NEXT:    .cfi_offset %rbx, -40
+; X64-NEXT:    .cfi_offset %r12, -32
 ; X64-NEXT:    .cfi_offset %r14, -24
 ; X64-NEXT:    .cfi_offset %r15, -16
 ; X64-NEXT:    movq %rdx, %rcx
@@ -329,17 +329,17 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r15, %rdx
 ; X64-NEXT:    imulq %r14, %r10
+; X64-NEXT:    addq %r15, %r10
 ; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    movq %r8, %r15
-; X64-NEXT:    imulq %r11, %r15
+; X64-NEXT:    movq %r8, %r12
+; X64-NEXT:    imulq %r11, %r12
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r15, %rdx
 ; X64-NEXT:    movq 24(%rsi), %r15
 ; X64-NEXT:    imulq %rbx, %r15
+; X64-NEXT:    addq %r12, %r15
 ; X64-NEXT:    addq %rdx, %r15
 ; X64-NEXT:    addq %rdi, %r8
 ; X64-NEXT:    adcq %r10, %r15
@@ -372,6 +372,8 @@ define void @test(ptr %a, ptr %b, ptr %out) #0 {
 ; X64-NEXT:    movq %rax, 16(%rcx)
 ; X64-NEXT:    movq %rdx, 24(%rcx)
 ; X64-NEXT:    popq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    popq %r12
 ; X64-NEXT:    .cfi_def_cfa_offset 24
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 16

diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 08d0f7cd08220..f5254ed37b53d 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -33,7 +33,6 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, %ebp
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -47,73 +46,74 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 16(%ecx), %ebp
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 16(%ecx), %ebx
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl 20(%ecx), %ebx
-; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl 20(%ecx), %eax
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    adcl $0, %edi
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl %ebp, %esi
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb %cl
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    addl %ebp, %eax
-; X32-NEXT:    movzbl %cl, %ecx
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl %ebp, %esi
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    movl 8(%edi), %ebp
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 8(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl 12(%edi), %ecx
-; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl 12(%eax), %ecx
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    adcl %ebp, %edi
-; X32-NEXT:    setb %bl
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    addl %edi, %ecx
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ecx
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -125,7 +125,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
@@ -145,7 +145,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %esi, %edx
-; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %ecx, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -154,9 +154,9 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 8(%ecx), %ebx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl 8(%ecx), %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -166,19 +166,20 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ebp, %ebx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -191,66 +192,66 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 4(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %ebx, %esi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %ecx
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    adcl %ebp, %ecx
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %esi
 ; X32-NEXT:    addl %ecx, %esi
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebp
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    adcl %ebp, %edi
+; X32-NEXT:    adcl %edi, %ebp
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %edi, %ebp
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebp
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -262,27 +263,28 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %edi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %edi, %esi
-; X32-NEXT:    setb %bl
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    adcl %ebx, %esi
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movzbl %bl, %ecx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    adcl %ecx, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %ebp, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    movl %ebp, %edx
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X32-NEXT:    adcl %edi, %eax
@@ -295,111 +297,109 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 16(%eax), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl 16(%eax), %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 20(%eax), %ecx
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %ecx, %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl %ecx, %esi
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %edi, %eax
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    adcl %edi, %ecx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ebp, %eax
+; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebp, %ecx
+; X32-NEXT:    addl %esi, %ecx
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 24(%eax), %ebx
-; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    movl 24(%eax), %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ebx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ebp
 ; X32-NEXT:    movl %eax, %edi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl 28(%eax), %ebx
-; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    movl 28(%eax), %esi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    adcl %ebp, %esi
+; X32-NEXT:    adcl %ebp, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %ebx, %ebp
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %ebx, %ebp
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -407,31 +407,29 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %esi
+; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebp, %ecx
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %edx, %ebp
-; X32-NEXT:    addl %edi, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    adcl %edi, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebp, %eax
+; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    addl %ebx, %edi
-; X32-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X32-NEXT:    addl %ebp, %edi
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X32-NEXT:    adcl %ecx, %eax
@@ -447,21 +445,20 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl %edi, %edx
 ; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    adcl $0, %eax
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %ecx
@@ -470,81 +467,80 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
+; X32-NEXT:    movl %eax, %edi
+; X32-NEXT:    addl %ecx, %edi
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    addl %ebx, %eax
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    setb %bl
+; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %ecx, %eax
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movzbl %bl, %eax
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    addl %ecx, %ebx
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %edi
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %edi
+; X32-NEXT:    adcl %edi, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    mull %esi
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    movzbl %bl, %eax
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edx, %ecx
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    adcl %edi, %ebx
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %edi
-; X32-NEXT:    movl %edi, %ebx
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    addl %ebx, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
 ; X32-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -553,58 +549,59 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl %ebx, %ecx
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    adcl %esi, %ebx
+; X32-NEXT:    adcl %ebx, %esi
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %ebx, %eax
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    addl %esi, %ebx
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    adcl %eax, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X32-NEXT:    addl %edi, %edx
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X32-NEXT:    adcl %esi, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    adcl %eax, %ebx
+; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X32-NEXT:    adcl %eax, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl %eax, %edx
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %ebp
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 32(%eax), %edi
 ; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -612,7 +609,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    mull %edi
@@ -624,7 +621,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl 36(%eax), %ecx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl %ecx, %esi
-; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %ebx, %eax
@@ -652,7 +649,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ecx, %ebp
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    movl %esi, %eax
-; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    addl %ebp, %eax
@@ -665,14 +662,14 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %esi, %edi
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 40(%eax), %esi
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
@@ -711,7 +708,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -743,56 +740,53 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %edi, %esi
-; X32-NEXT:    imull %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %eax, %edi
+; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    imull %ebx, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X32-NEXT:    imull %edi, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    mull %ebp
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %esi, %edx
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    imull %ebp, %esi
-; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    imull %esi, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    imull %ecx, %edi
+; X32-NEXT:    addl %edx, %edi
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %ebx, %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    addl %edi, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %ecx
 ; X32-NEXT:    setb %bl
-; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X32-NEXT:    mull %ebp
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -801,62 +795,60 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    imull %eax, %ecx
 ; X32-NEXT:    movl 56(%edi), %esi
-; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    addl %edx, %esi
-; X32-NEXT:    movl 48(%edi), %edi
-; X32-NEXT:    movl 52(%ebx), %ebp
+; X32-NEXT:    movl 48(%edi), %ecx
+; X32-NEXT:    movl 52(%edi), %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %ebx
-; X32-NEXT:    imull %ebp, %ebx
-; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    mull %edi
-; X32-NEXT:    addl %ebx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %edi, %ecx
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl %esi, %ecx
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    imull %ebx, %edx
+; X32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    imull %ecx, %edi
+; X32-NEXT:    addl %edx, %edi
 ; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    addl %edx, %edi
+; X32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    movl %edx, %ebx
-; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl %esi, %ebp
-; X32-NEXT:    adcl $0, %ebx
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    adcl %esi, %edi
+; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
-; X32-NEXT:    movl %eax, %edi
-; X32-NEXT:    addl %ebp, %edi
+; X32-NEXT:    movl %eax, %ebp
+; X32-NEXT:    addl %esi, %ebp
 ; X32-NEXT:    adcl %ebx, %ecx
 ; X32-NEXT:    setb %bl
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    mull %esi
+; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movzbl %bl, %ecx
 ; X32-NEXT:    adcl %ecx, %edx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl %edi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -888,10 +880,10 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    addl %esi, %eax
-; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl 32(%ecx), %esi
 ; X32-NEXT:    movl %esi, %eax
@@ -925,8 +917,8 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -959,20 +951,20 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edi
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
-; X32-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %ebp
-; X32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ebx
 ; X32-NEXT:    movl %esi, %eax
 ; X32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
@@ -980,60 +972,60 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    addl %ebp, %eax
 ; X32-NEXT:    movl %eax, %ebp
 ; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    addl %esi, %eax
 ; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %edx
-; X32-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT:    addl %edi, (%esp) # 4-byte Folded Spill
 ; X32-NEXT:    adcl %ecx, %ebp
-; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X32-NEXT:    adcl %eax, %esi
 ; X32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl $0, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl 48(%ecx), %edi
+; X32-NEXT:    movl 48(%ecx), %ebp
 ; X32-NEXT:    movl %ebx, %esi
-; X32-NEXT:    imull %edi, %esi
-; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    movl %ebp, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X32-NEXT:    mull %ebx
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    movl 52(%ecx), %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    imull %eax, %ebx
+; X32-NEXT:    addl %esi, %ebx
 ; X32-NEXT:    addl %edx, %ebx
 ; X32-NEXT:    movl 56(%ecx), %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %esi
+; X32-NEXT:    movl %ecx, %esi
+; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    mull %ecx
-; X32-NEXT:    addl %esi, %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    imull %ecx, %edx
 ; X32-NEXT:    movl 60(%esi), %esi
-; X32-NEXT:    imull %ecx, %esi
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    imull %edi, %esi
+; X32-NEXT:    addl %edx, %esi
+; X32-NEXT:    mull %edi
 ; X32-NEXT:    addl %edx, %esi
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %ebx, %esi
-; X32-NEXT:    movl %ecx, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebp, %eax
-; X32-NEXT:    mull %edi
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ecx
 ; X32-NEXT:    movl %eax, %ebx
 ; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X32-NEXT:    adcl $0, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %edi, %eax
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %edi
@@ -1050,43 +1042,43 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    adcl %esi, %edx
 ; X32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    imull %eax, %edi
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %eax, %ebx
+; X32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl %edi, %ecx
+; X32-NEXT:    addl %edx, %ecx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    imull %eax, %ecx
-; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X32-NEXT:    imull %esi, %edx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X32-NEXT:    imull %edi, %ebp
+; X32-NEXT:    addl %edx, %ebp
 ; X32-NEXT:    mull %edi
+; X32-NEXT:    addl %edx, %ebp
+; X32-NEXT:    addl %ebx, %eax
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    addl %ecx, %edx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X32-NEXT:    imull %ebp, %edi
-; X32-NEXT:    addl %edx, %edi
-; X32-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X32-NEXT:    imull %edi, %ecx
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X32-NEXT:    mull %ebx
-; X32-NEXT:    addl %ecx, %edx
+; X32-NEXT:    adcl %ecx, %ebp
+; X32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl %edi, %eax
+; X32-NEXT:    movl %edi, %ebx
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X32-NEXT:    imull %ebx, %ecx
-; X32-NEXT:    addl %edx, %ecx
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %ebx, %eax
-; X32-NEXT:    mull %esi
-; X32-NEXT:    movl %edx, %ebx
+; X32-NEXT:    mull %ecx
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    movl %edi, %eax
-; X32-NEXT:    mull %esi
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    mull %ecx
 ; X32-NEXT:    movl %edx, %esi
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    addl %ebx, %ecx
+; X32-NEXT:    addl %edi, %ecx
 ; X32-NEXT:    adcl $0, %esi
-; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X32-NEXT:    mull %ebp
 ; X32-NEXT:    movl %edx, %ebx
 ; X32-NEXT:    movl %eax, %edi
@@ -1105,9 +1097,9 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X32-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
 ; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -1179,271 +1171,270 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %rdx, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq (%rdi), %rbx
+; X64-NEXT:    movq (%rdi), %r14
 ; X64-NEXT:    movq 8(%rdi), %r9
-; X64-NEXT:    movq 24(%rdi), %r12
-; X64-NEXT:    movq 16(%rdi), %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    movq 8(%rsi), %r11
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %rcx, %rsi
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq 24(%rdi), %r15
+; X64-NEXT:    movq 16(%rdi), %rax
+; X64-NEXT:    movq (%rsi), %rdi
+; X64-NEXT:    movq 8(%rsi), %rbx
+; X64-NEXT:    movq %rsi, %r12
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %rcx, %r10
 ; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %r10, %rcx
-; X64-NEXT:    adcq %r8, %r14
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r10, %r11
+; X64-NEXT:    adcq %r8, %rcx
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %esi
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r14, %r10
+; X64-NEXT:    addq %rcx, %r10
 ; X64-NEXT:    adcq %rsi, %r13
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r14, %r15
-; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbx, %r12
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rsi, %r8
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    addq %r15, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rbp, %rbx
+; X64-NEXT:    adcq %r14, %rbp
 ; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r9, %rdi
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %rbx, %rbp
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %rbp, %rcx
 ; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT:    adcq %rcx, %r14
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq %r11, %r14
 ; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %rdi, %rsi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 16(%rdi), %r8
-; X64-NEXT:    movq %r12, %r11
+; X64-NEXT:    movq %r12, %r9
 ; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, %r12
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rcx, %r15
-; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    movq 24(%rsi), %rsi
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq 16(%r12), %rsi
+; X64-NEXT:    movq %r8, %rbx
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %r12
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %r11, %rbp
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq 24(%r9), %rdi
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r15, %r11
-; X64-NEXT:    adcq %rbx, %r9
-; X64-NEXT:    setb %bl
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rbp, %rbx
+; X64-NEXT:    adcq %r15, %r9
+; X64-NEXT:    setb %bpl
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %r9, %rcx
-; X64-NEXT:    movzbl %bl, %eax
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r9, %r11
+; X64-NEXT:    movzbl %bpl, %eax
 ; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %rbp, %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r14, %rbx
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq %r10, %rcx
+; X64-NEXT:    addq %r10, %r11
 ; X64-NEXT:    adcq %r13, %r15
-; X64-NEXT:    setb %r12b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    setb %bpl
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rbx
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %r9, %rbp
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movzbl %dil, %edi
-; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    addq %rcx, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r15, %rbx
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rcx, %r9
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    addq %r9, %rax
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    adcq %r8, %rcx
+; X64-NEXT:    setb %r8b
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rcx, %r14
+; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    adcq %rax, %rdx
+; X64-NEXT:    addq %r11, %rbx
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r12b, %ecx
-; X64-NEXT:    adcq %rcx, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r15, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movzbl %bpl, %eax
+; X64-NEXT:    adcq %rax, %r14
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 32(%rcx), %r15
-; X64-NEXT:    imulq %r15, %rsi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq 32(%rcx), %r11
+; X64-NEXT:    imulq %r11, %rdi
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    movq 40(%rcx), %rsi
-; X64-NEXT:    imulq %rsi, %r8
-; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    movq 40(%rcx), %r8
+; X64-NEXT:    imulq %r8, %rsi
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    addq %rdx, %rsi
 ; X64-NEXT:    movq 48(%rcx), %rax
-; X64-NEXT:    movq %rcx, %r11
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %r14, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rcx, %rdx
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq 56(%r11), %r11
-; X64-NEXT:    imulq %rbx, %r11
-; X64-NEXT:    addq %rdx, %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    imulq %r10, %rcx
+; X64-NEXT:    movq 56(%rdx), %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    imulq %rdi, %rbp
+; X64-NEXT:    addq %rcx, %rbp
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %rdx, %rbp
 ; X64-NEXT:    addq %r9, %rcx
-; X64-NEXT:    adcq %r8, %r11
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbx, %r8
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    adcq %rsi, %rbp
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdi, %rsi
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rbx
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rdi, %r11
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rbx, %r13
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %r11, %rbx
 ; X64-NEXT:    adcq %r9, %r15
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %r15, %r8
-; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    adcq %rax, %r12
 ; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    adcq %r11, %r12
+; X64-NEXT:    adcq %rbp, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq 56(%rcx), %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    imulq %rax, %rsi
 ; X64-NEXT:    movq 48(%rcx), %r11
 ; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    mulq %r11
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %r14, %r11
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    imulq %rcx, %r11
+; X64-NEXT:    addq %rsi, %r11
 ; X64-NEXT:    addq %rdx, %r11
 ; X64-NEXT:    movq 32(%rdi), %r9
-; X64-NEXT:    movq 40(%rdi), %r15
+; X64-NEXT:    movq 40(%rdi), %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    imulq %r15, %rsi
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    imulq %rdi, %rdx
+; X64-NEXT:    imulq %r9, %r13
+; X64-NEXT:    addq %rdx, %r13
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    imulq %r9, %r10
-; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    adcq %r11, %r10
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %rdx, %r13
+; X64-NEXT:    addq %r10, %r15
+; X64-NEXT:    adcq %r11, %r13
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rsi, %rbx
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r10, %r11
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rbx, %r9
-; X64-NEXT:    adcq %rbp, %rsi
-; X64-NEXT:    setb %bl
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    movzbl %bl, %esi
-; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    adcq %r10, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq %r13, %r9
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %r10
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %r11, %rcx
+; X64-NEXT:    adcq %rbp, %r9
+; X64-NEXT:    setb %r11b
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r10
+; X64-NEXT:    addq %r9, %rax
+; X64-NEXT:    movzbl %r11b, %edi
+; X64-NEXT:    adcq %rdi, %rdx
+; X64-NEXT:    addq %r15, %rax
+; X64-NEXT:    adcq %r13, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    adcq %rbx, %rcx
 ; X64-NEXT:    adcq %r8, %rax
 ; X64-NEXT:    adcq %r12, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    adcq %r14, %rax
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq (%rsp), %rsi # 8-byte Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, (%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 8(%rsi)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 16(%rsi)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, 24(%rsi)
-; X64-NEXT:    movq %rcx, 32(%rsi)
-; X64-NEXT:    movq %r9, 40(%rsi)
-; X64-NEXT:    movq %rax, 48(%rsi)
-; X64-NEXT:    movq %rdx, 56(%rsi)
-; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, (%rdi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, 8(%rdi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, 16(%rdi)
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, 24(%rdi)
+; X64-NEXT:    movq %rsi, 32(%rdi)
+; X64-NEXT:    movq %rcx, 40(%rdi)
+; X64-NEXT:    movq %rax, 48(%rdi)
+; X64-NEXT:    movq %rdx, 56(%rdi)
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
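
A note on the test_512 churn above: it is largely register renaming and
spill-slot reshuffling that falls out of the new add ordering, and the X64
version drops its scratch stack slot entirely (the pushq %rax / addq $8, %rsp
pair is gone). The recurring rewrite in these hunks is the machine combiner
re-associating a chain of the form (hi + p0) + p1, where p0 and p1 are
independent imull products and hi is the high half of a widening mull, into
(p0 + p1) + hi. A minimal C sketch of that dataflow, with illustrative names
that are not taken from the tests:

#include <stdint.h>

/* High word of a 64x64-bit product assembled from 32-bit pieces,
 * mirroring the X32 sequences above. p0 and p1 come from the two
 * imull instructions; hi is the high half of the widening mull. */
uint32_t high_word(uint32_t a_lo, uint32_t a_hi,
                   uint32_t b_lo, uint32_t b_hi) {
  uint32_t hi = (uint32_t)(((uint64_t)a_lo * (uint64_t)b_lo) >> 32);
  uint32_t p0 = a_lo * b_hi;
  uint32_t p1 = a_hi * b_lo;
  /* Old schedule: (hi + p0) + p1 -- both adds wait on the mull.
   * New schedule: (p0 + p1) + hi -- the imull sum is independent
   * of the mull, so the critical path through %edx gets shorter. */
  return (p0 + p1) + hi;
}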

diff  --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index 1edd1fa4373c8..0bc23af920eed 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -9,8 +9,8 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    imulq %rdi, %rcx
 ; X64-NEXT:    mulq %rdx
-; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    imulq %rsi, %r8
+; X64-NEXT:    addq %rcx, %r8
 ; X64-NEXT:    addq %r8, %rdx
 ; X64-NEXT:    retq
 ;
@@ -30,57 +30,56 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %bl, %esi
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, 4(%ecx)
+; X86-NEXT:    movl %edi, 4(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %eax, 8(%ecx)

diff  --git a/llvm/test/CodeGen/X86/mul64.ll b/llvm/test/CodeGen/X86/mul64.ll
index 1feed4b207a23..d2afb2c529e42 100644
--- a/llvm/test/CodeGen/X86/mul64.ll
+++ b/llvm/test/CodeGen/X86/mul64.ll
@@ -11,8 +11,8 @@ define i64 @foo(i64 %t, i64 %u) nounwind {
 ; X32-NEXT:    movl %ecx, %eax
 ; X32-NEXT:    mull %esi
 ; X32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    addl %ecx, %edx
 ; X32-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    addl %ecx, %esi
 ; X32-NEXT:    addl %esi, %edx
 ; X32-NEXT:    popl %esi
 ; X32-NEXT:    retl
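
mul64 above is the minimal instance of that rewrite: the first imull result
used to be folded into %edx before the second imull had executed, whereas now
the two imull products are summed with each other first (addl %ecx, %esi) and
folded into %edx once. The same reassociation accounts for the mul128.ll
changes above and the muloti.ll changes below.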

diff  --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 9a6cf0b065662..60a2e21dcd03d 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -22,8 +22,8 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    mulq %rbx
 ; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    addq %rdi, %rdx
 ; CHECK-NEXT:    imulq %rcx, %rbx
+; CHECK-NEXT:    addq %rdi, %rbx
 ; CHECK-NEXT:    addq %rdx, %rbx
 ; CHECK-NEXT:    movq %rcx, %rdi
 ; CHECK-NEXT:    sarq $63, %rdi
@@ -32,8 +32,8 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    mulq %r9
 ; CHECK-NEXT:    movq %rax, %r10
-; CHECK-NEXT:    addq %r14, %rdx
 ; CHECK-NEXT:    imulq %r9, %rdi
+; CHECK-NEXT:    addq %r14, %rdi
 ; CHECK-NEXT:    addq %rdx, %rdi
 ; CHECK-NEXT:    addq %r8, %r10
 ; CHECK-NEXT:    adcq %rbx, %rdi

diff  --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 43110af64c77e..fb222cd2b2e10 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -352,23 +352,23 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %edi, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %ecx, %edi
-; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    addl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
 ; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    addl %esi, %ecx
 ; X86-NOSSE-NEXT:    movl %ecx, (%eax)
 ; X86-NOSSE-NEXT:    movl $0, 12(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 8(%eax)
@@ -420,20 +420,18 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ;
 ; X86-POPCNT-LABEL: cnt128:
 ; X86-POPCNT:       # %bb.0:
-; X86-POPCNT-NEXT:    pushl %esi
 ; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
-; X86-POPCNT-NEXT:    addl %ecx, %esi
-; X86-POPCNT-NEXT:    addl %edx, %esi
-; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    addl %edx, %ecx
+; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    addl %ecx, %edx
+; X86-POPCNT-NEXT:    movl %edx, (%eax)
 ; X86-POPCNT-NEXT:    movl $0, 12(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 8(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 4(%eax)
-; X86-POPCNT-NEXT:    popl %esi
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128:
@@ -800,83 +798,80 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    shrl %ecx
-; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %edi, %ecx
-; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ecx, %ebx
-; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ecx, %ebp
+; X86-NOSSE-NEXT:    movl %ebx, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    subl %eax, %ebx
+; X86-NOSSE-NEXT:    movl $858993459, %eax # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    andl %eax, %edi
 ; X86-NOSSE-NEXT:    shrl $2, %ebx
-; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    addl %ebp, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %ebx, %ebp
-; X86-NOSSE-NEXT:    movl %eax, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
+; X86-NOSSE-NEXT:    addl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %ebx, %edi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl %edi, %ebx
-; X86-NOSSE-NEXT:    subl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    shrl $2, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    addl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %eax # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %ebp # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %ebp, %eax
-; X86-NOSSE-NEXT:    subl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    subl %ebx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
-; X86-NOSSE-NEXT:    addl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %esi, %ebp
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %esi, %eax
-; X86-NOSSE-NEXT:    subl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    andl %eax, %esi
+; X86-NOSSE-NEXT:    addl %ebx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
+; X86-NOSSE-NEXT:    shrl $4, %ebx
+; X86-NOSSE-NEXT:    addl %esi, %ebx
+; X86-NOSSE-NEXT:    movl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %esi, %edi
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %ebp # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ebp
+; X86-NOSSE-NEXT:    andl %esi, %ebx
+; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    addl %ebp, %edi
+; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    shrl %ebx
+; X86-NOSSE-NEXT:    andl %ecx, %ebx
+; X86-NOSSE-NEXT:    subl %ebx, %edx
+; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl $4, %eax
-; X86-NOSSE-NEXT:    addl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ebx, %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    andl %eax, %edx
+; X86-NOSSE-NEXT:    addl %ebx, %edx
+; X86-NOSSE-NEXT:    movl %edx, %ebp
+; X86-NOSSE-NEXT:    shrl $4, %ebp
+; X86-NOSSE-NEXT:    addl %edx, %ebp
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOSSE-NEXT:    andl %esi, %ebp
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %ecx, %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    xorl %ecx, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %edx, (%eax)
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl %edi
+; X86-NOSSE-NEXT:    andl %ecx, %edi
+; X86-NOSSE-NEXT:    subl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %ecx
+; X86-NOSSE-NEXT:    andl %eax, %ecx
+; X86-NOSSE-NEXT:    shrl $2, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
+; X86-NOSSE-NEXT:    addl %ecx, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %ecx
+; X86-NOSSE-NEXT:    shrl $4, %ecx
+; X86-NOSSE-NEXT:    addl %ebx, %ecx
+; X86-NOSSE-NEXT:    andl %esi, %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ecx
+; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    xorl %edx, %edx
+; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, (%eax)
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -925,21 +920,19 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ;
 ; X86-POPCNT-LABEL: cnt128_optsize:
 ; X86-POPCNT:       # %bb.0:
-; X86-POPCNT-NEXT:    pushl %esi
 ; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
-; X86-POPCNT-NEXT:    addl %ecx, %esi
-; X86-POPCNT-NEXT:    addl %edx, %esi
+; X86-POPCNT-NEXT:    addl %edx, %ecx
+; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
 ; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
-; X86-POPCNT-NEXT:    movl %esi, (%eax)
-; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    movl %edx, (%eax)
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_optsize:
@@ -1230,83 +1223,80 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    shrl %ecx
-; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %edi, %ecx
-; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ecx, %ebx
-; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ecx, %ebp
+; X86-NOSSE-NEXT:    movl %ebx, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    subl %eax, %ebx
+; X86-NOSSE-NEXT:    movl $858993459, %eax # imm = 0x33333333
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    andl %eax, %edi
 ; X86-NOSSE-NEXT:    shrl $2, %ebx
-; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    addl %ebp, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %ebx, %ebp
-; X86-NOSSE-NEXT:    movl %eax, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
+; X86-NOSSE-NEXT:    addl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %ebx, %edi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl %edi, %ebx
-; X86-NOSSE-NEXT:    subl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    shrl $2, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    addl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %eax # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %ebp # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %ebp, %eax
-; X86-NOSSE-NEXT:    subl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    subl %ebx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
-; X86-NOSSE-NEXT:    addl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %esi, %ebp
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %esi, %eax
-; X86-NOSSE-NEXT:    subl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    andl %eax, %esi
+; X86-NOSSE-NEXT:    addl %ebx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %ebx
+; X86-NOSSE-NEXT:    shrl $4, %ebx
+; X86-NOSSE-NEXT:    addl %esi, %ebx
+; X86-NOSSE-NEXT:    movl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %esi, %edi
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %ebp # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ebp
+; X86-NOSSE-NEXT:    andl %esi, %ebx
+; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    addl %ebp, %edi
+; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    shrl %ebx
+; X86-NOSSE-NEXT:    andl %ecx, %ebx
+; X86-NOSSE-NEXT:    subl %ebx, %edx
+; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl $4, %eax
-; X86-NOSSE-NEXT:    addl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ebx, %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    andl %eax, %edx
+; X86-NOSSE-NEXT:    addl %ebx, %edx
+; X86-NOSSE-NEXT:    movl %edx, %ebp
+; X86-NOSSE-NEXT:    shrl $4, %ebp
+; X86-NOSSE-NEXT:    addl %edx, %ebp
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOSSE-NEXT:    andl %esi, %ebp
+; X86-NOSSE-NEXT:    imull $16843009, %ebp, %edx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %ecx, %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    xorl %ecx, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %edx, (%eax)
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl %edi
+; X86-NOSSE-NEXT:    andl %ecx, %edi
+; X86-NOSSE-NEXT:    subl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %ecx
+; X86-NOSSE-NEXT:    andl %eax, %ecx
+; X86-NOSSE-NEXT:    shrl $2, %ebx
+; X86-NOSSE-NEXT:    andl %eax, %ebx
+; X86-NOSSE-NEXT:    addl %ecx, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %ecx
+; X86-NOSSE-NEXT:    shrl $4, %ecx
+; X86-NOSSE-NEXT:    addl %ebx, %ecx
+; X86-NOSSE-NEXT:    andl %esi, %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %ecx
+; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    xorl %edx, %edx
+; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, (%eax)
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -1355,21 +1345,19 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ;
 ; X86-POPCNT-LABEL: cnt128_pgso:
 ; X86-POPCNT:       # %bb.0:
-; X86-POPCNT-NEXT:    pushl %esi
 ; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
-; X86-POPCNT-NEXT:    addl %ecx, %esi
-; X86-POPCNT-NEXT:    addl %edx, %esi
+; X86-POPCNT-NEXT:    addl %edx, %ecx
+; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    addl %ecx, %edx
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
 ; X86-POPCNT-NEXT:    movl %ecx, 12(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
-; X86-POPCNT-NEXT:    movl %esi, (%eax)
-; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    movl %edx, (%eax)
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_pgso:

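In the X86-POPCNT functions above the visible effect is register pressure rather than latency: the i128 popcount is the sum of four independent popcntl results, and with reassociation enabled the combiner now emits a single accumulator chain instead of the two-partial-sum tree, which is what lets the %esi push/pop disappear. Both orders in C++ (C++20 <bit>; a sketch, not test code):

#include <bit>
#include <cstdint>

// Two partial sums live at once: the old code's extra %esi.
uint32_t cnt_tree(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return (std::popcount(a) + std::popcount(b)) +
         (std::popcount(c) + std::popcount(d));
}

// One accumulator threaded through all four adds: what the new CHECK lines match.
uint32_t cnt_chain(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return ((std::popcount(a) + std::popcount(b)) + std::popcount(c)) +
         std::popcount(d);
}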
diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll
index de34bfb13159c..1705edab9e82b 100644
--- a/llvm/test/CodeGen/X86/pr34080-2.ll
+++ b/llvm/test/CodeGen/X86/pr34080-2.ll
@@ -31,10 +31,6 @@ define void @computeJD(ptr) nounwind {
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    imull %edx
 ; CHECK-NEXT:    movl %edx, %edi
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    shrl $31, %eax
-; CHECK-NEXT:    sarl $7, %edi
-; CHECK-NEXT:    addl %eax, %edi
 ; CHECK-NEXT:    imull $36525, %esi, %eax # imm = 0x8EAD
 ; CHECK-NEXT:    addl $172251900, %eax # imm = 0xA445AFC
 ; CHECK-NEXT:    movl $1374389535, %edx # imm = 0x51EB851F
@@ -43,7 +39,11 @@ define void @computeJD(ptr) nounwind {
 ; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    sarl $5, %edx
 ; CHECK-NEXT:    addl %eax, %edx
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shrl $31, %eax
 ; CHECK-NEXT:    addl 16(%ebx), %ecx
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    sarl $7, %edi
 ; CHECK-NEXT:    addl %edi, %ecx
 ; CHECK-NEXT:    leal 257(%ecx,%edx), %eax
 ; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)

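pr34080-2.ll exercises signed division by a constant, where the quotient is a high-half multiply followed by a shift and a sign fixup. The four-instruction fixup block deleted at the top of the hunk is not gone: its two addends (the shrl $31 and sarl $7 of %edi) are now folded into the %ecx accumulation one at a time, interleaved with the addl 16(%ebx). The idiom itself, using the /100 magic constant visible in the CHECK lines (the function name and framing are mine):

#include <cstdint>

int32_t div100(int32_t x) {
  int64_t prod = int64_t(x) * 0x51EB851F;     // 1374389535, magic for /100
  int32_t hi = int32_t(prod >> 32);           // high half of the multiply
  int32_t fix = int32_t(uint32_t(hi) >> 31);  // shrl $31: +1 when the quotient is negative
  return (hi >> 5) + fix;                     // sarl $5; addl
}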
diff --git a/llvm/test/CodeGen/X86/pr36865.ll b/llvm/test/CodeGen/X86/pr36865.ll
index 580426a069f41..35d644027e1b0 100644
--- a/llvm/test/CodeGen/X86/pr36865.ll
+++ b/llvm/test/CodeGen/X86/pr36865.ll
@@ -13,10 +13,10 @@ define void @main() {
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movl (%rax), %ecx
 ; CHECK-NEXT:    addl 0, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    addl {{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    addl %ecx, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    addl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT:    movl %ecx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    imull %eax, %ecx

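The pr36865.ll hunk is the smallest possible instance: (a + c) + c becomes a + (c + c), so the doubling of %ecx no longer waits on the load chain feeding %eax. In C++ terms (an illustration, not the test's IR):

#include <cstdint>

uint32_t sum(uint32_t a, uint32_t c) {
  uint32_t twice = c + c;  // independent of a, can issue before a's load resolves
  return a + twice;
}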
diff --git a/llvm/test/CodeGen/X86/reassociate-add.ll b/llvm/test/CodeGen/X86/reassociate-add.ll
index 402cd5166e362..76f5535a3beb1 100644
--- a/llvm/test/CodeGen/X86/reassociate-add.ll
+++ b/llvm/test/CodeGen/X86/reassociate-add.ll
@@ -12,12 +12,12 @@ define void @add8(i8 %x0, i8 %x1, i8 %x2, i8* %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    orb $16, %dil
 ; CHECK-NEXT:    orb $32, %sil
+; CHECK-NEXT:    addb %dil, %sil
 ; CHECK-NEXT:    addb $-8, %dl
 ; CHECK-NEXT:    orb $7, %dl
 ; CHECK-NEXT:    movzbl %dl, %eax
 ; CHECK-NEXT:    imull $100, %eax, %eax
 ; CHECK-NEXT:    addb %sil, %al
-; CHECK-NEXT:    addb %dil, %al
 ; CHECK-NEXT:    movb %al, (%rcx)
 ; CHECK-NEXT:    retq
   %v0 = or i8 %x0, 16
@@ -36,11 +36,11 @@ define void @add16(i16 %x0, i16 %x1, i16 %x2, i16* %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    orl $16, %edi
 ; CHECK-NEXT:    orl $32, %esi
+; CHECK-NEXT:    addl %edi, %esi
 ; CHECK-NEXT:    addl $-8, %edx
 ; CHECK-NEXT:    orl $7, %edx
 ; CHECK-NEXT:    imull $100, %edx, %eax
 ; CHECK-NEXT:    addl %esi, %eax
-; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    movw %ax, (%rcx)
 ; CHECK-NEXT:    retq
   %v0 = or i16 %x0, 16
@@ -59,11 +59,11 @@ define void @add32(i32 %x0, i32 %x1, i32 %x2, i32* %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    orl $16, %edi
 ; CHECK-NEXT:    orl $32, %esi
+; CHECK-NEXT:    addl %edi, %esi
 ; CHECK-NEXT:    addl $-8, %edx
 ; CHECK-NEXT:    orl $7, %edx
 ; CHECK-NEXT:    imull $100, %edx, %eax
 ; CHECK-NEXT:    addl %esi, %eax
-; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    movl %eax, (%rcx)
 ; CHECK-NEXT:    retq
   %v0 = or i32 %x0, 16
@@ -82,11 +82,11 @@ define void @add64(i64 %x0, i64 %x1, i64 %x2, i64* %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    orq $16, %rdi
 ; CHECK-NEXT:    orq $32, %rsi
+; CHECK-NEXT:    addq %rdi, %rsi
 ; CHECK-NEXT:    addq $-8, %rdx
 ; CHECK-NEXT:    orq $7, %rdx
 ; CHECK-NEXT:    imulq $100, %rdx, %rax
 ; CHECK-NEXT:    addq %rsi, %rax
-; CHECK-NEXT:    addq %rdi, %rax
 ; CHECK-NEXT:    movq %rax, (%rcx)
 ; CHECK-NEXT:    retq
   %v0 = or i64 %x0, 16

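reassociate-add.ll is the test written for exactly this transform, and the four hunks above all show the same win: the two cheap or-results are now summed while the imull $100 is still executing, leaving a single add on the multiply's critical path. Reconstructing add32 from the CHECK lines and the truncated IR tail, as plain C++:

#include <cstdint>

void add32(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t *p) {
  uint32_t v0 = x0 | 16;
  uint32_t v1 = x1 | 32;
  uint32_t v2 = (x2 - 8) | 7;
  uint32_t v3 = v2 * 100;   // the long-latency imull
  // Old: ((v3 + v1) + v0), both adds serialized behind the imull.
  // New: (v1 + v0) overlaps the imull, then a single dependent add.
  *p = v3 + (v1 + v0);
}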
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index 83802ce434426..83b9460c7dae3 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -212,6 +212,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
@@ -233,16 +234,14 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %edi
@@ -251,26 +250,27 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -294,76 +294,76 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -386,182 +386,176 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %cl, %esi
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl %al, %edi
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb %dl
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movzbl %dl, %esi
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
@@ -623,51 +617,49 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    imull %ebx
-; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    adcl %edx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -721,18 +713,17 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %edx, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, %ebx
@@ -841,188 +832,190 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %r9, %r10
-; X64-NEXT:    movq %r8, %rbp
-; X64-NEXT:    movq %rcx, %r12
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %r12
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %r8
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    andl $1, %ecx
-; X64-NEXT:    negq %rcx
-; X64-NEXT:    andl $1, %r12d
-; X64-NEXT:    negq %r12
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    negq %rsi
+; X64-NEXT:    andl $1, %r15d
+; X64-NEXT:    negq %r15
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %rdx, %r15
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %r10, %r14
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    adcq %rdx, %r9
+; X64-NEXT:    addq %rax, %rbx
+; X64-NEXT:    adcq %rdx, %rcx
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movzbl %dil, %r10d
-; X64-NEXT:    addq %rax, %r9
-; X64-NEXT:    adcq %rdx, %r10
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rbp
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movzbl %dil, %r13d
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    adcq %rdx, %r13
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rsi, %r11
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    addq %rdi, %r11
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    addq %r11, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rdi, %rbx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movzbl %dil, %edx
-; X64-NEXT:    adcq %rdx, %rsi
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq %r10, %rdi
+; X64-NEXT:    setb %r11b
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    movzbl %r11b, %edx
+; X64-NEXT:    adcq %rdx, %r10
+; X64-NEXT:    addq %rbp, %rax
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    adcq %r15, %rsi
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    adcq %rbx, %r10
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %r11, %rbx
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    addq %r11, %rbp
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq %r13, %rbx
-; X64-NEXT:    adcq %r11, %r15
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r8, %rbp
+; X64-NEXT:    adcq %rax, %r11
 ; X64-NEXT:    setb %al
-; X64-NEXT:    addq %r8, %r15
-; X64-NEXT:    movzbl %al, %r8d
-; X64-NEXT:    adcq %rdx, %r8
-; X64-NEXT:    addq %r13, %rdi
+; X64-NEXT:    addq %r9, %r11
+; X64-NEXT:    movzbl %al, %r14d
+; X64-NEXT:    adcq %rdx, %r14
+; X64-NEXT:    addq %r8, %rdi
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rsi, %rbx
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    addq %r9, %r15
-; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rdx, %r11
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    adcq %rdx, %rdi
-; X64-NEXT:    setb %r9b
-; X64-NEXT:    addq %rax, %rdi
-; X64-NEXT:    movzbl %r9b, %esi
-; X64-NEXT:    adcq %rdx, %rsi
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    adcq %r8, %r11
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    addq %r14, %r8
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    adcq %r10, %rbp
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    addq %rcx, %r11
+; X64-NEXT:    adcq %r13, %r14
+; X64-NEXT:    setb %dil
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rdx, %rbx
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    addq %rax, %rbx
+; X64-NEXT:    adcq %rdx, %r10
+; X64-NEXT:    setb %cl
+; X64-NEXT:    addq %rax, %r10
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    adcq %rdx, %rcx
+; X64-NEXT:    addq %rax, %r11
+; X64-NEXT:    adcq %r14, %rbx
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    adcq %rax, %r10
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    addq %rax, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    addq %r9, %r8
-; X64-NEXT:    adcq %r14, %r10
+; X64-NEXT:    addq %r9, %r14
+; X64-NEXT:    adcq %rax, %rcx
+; X64-NEXT:    movq %rax, %rdx
 ; X64-NEXT:    setb %al
-; X64-NEXT:    addq %rsi, %r10
-; X64-NEXT:    movzbl %al, %esi
-; X64-NEXT:    adcq %rdx, %rsi
+; X64-NEXT:    addq %rdi, %rcx
+; X64-NEXT:    movzbl %al, %edi
+; X64-NEXT:    adcq %r8, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %r12, %rax
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    imulq %r12, %rbp
-; X64-NEXT:    addq %rax, %rbp
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    imulq %rcx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r9, %r12
+; X64-NEXT:    imulq %r15, %rax
+; X64-NEXT:    imulq %r15, %r12
 ; X64-NEXT:    addq %rax, %r12
-; X64-NEXT:    adcq %rdx, %rbp
-; X64-NEXT:    addq %r10, %r12
-; X64-NEXT:    adcq %rsi, %rbp
-; X64-NEXT:    movq %r13, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    addq %rdx, %r12
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    imulq %rsi
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r9, %r8
+; X64-NEXT:    addq %rax, %r8
+; X64-NEXT:    adcq %rdx, %r12
+; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    adcq %rdi, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    movq %r13, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    addq %rdi, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    addq %rdx, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    adcq %r9, %rsi
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    adcq %rdx, %rcx
+; X64-NEXT:    setb %r9b
+; X64-NEXT:    addq %rdi, %rcx
+; X64-NEXT:    movzbl %r9b, %edi
+; X64-NEXT:    adcq %rdx, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    imulq %rsi, %rdx
+; X64-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    addq %rdx, %rsi
-; X64-NEXT:    movzbl %r10b, %r10d
-; X64-NEXT:    adcq %r9, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    imulq %rcx, %r9
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    addq %r9, %rcx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    addq %r13, %rax
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    addq %rsi, %rax
-; X64-NEXT:    adcq %r10, %rcx
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    addq %rcx, %rax
+; X64-NEXT:    adcq %rdi, %rsi
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r8, %r14
-; X64-NEXT:    adcq %r12, %rax
-; X64-NEXT:    adcq %rbp, %rcx
-; X64-NEXT:    addq %r15, %r13
-; X64-NEXT:    adcq %r11, %r14
-; X64-NEXT:    adcq %rdi, %rax
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    movq %rbx, %rdx
-; X64-NEXT:    sarq $63, %rdx
-; X64-NEXT:    xorq %rdx, %rcx
-; X64-NEXT:    xorq %rdx, %r14
-; X64-NEXT:    orq %rcx, %r14
-; X64-NEXT:    xorq %rdx, %rax
-; X64-NEXT:    orq %r14, %rax
-; X64-NEXT:    xorq %r13, %rdx
-; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    adcq %r14, %r15
+; X64-NEXT:    adcq %r8, %rax
+; X64-NEXT:    adcq %r12, %rsi
+; X64-NEXT:    addq %r11, %r13
+; X64-NEXT:    adcq %rbx, %r15
+; X64-NEXT:    adcq %r10, %rax
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    movq %rbp, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    xorq %rcx, %rsi
+; X64-NEXT:    xorq %rcx, %r15
+; X64-NEXT:    orq %rsi, %r15
+; X64-NEXT:    xorq %rcx, %rax
+; X64-NEXT:    orq %r15, %rax
+; X64-NEXT:    xorq %r13, %rcx
+; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movl %eax, %esi
 ; X64-NEXT:    andl $1, %esi
-; X64-NEXT:    movq %rsi, %rcx
-; X64-NEXT:    negq %rcx
-; X64-NEXT:    xorq %rcx, %rbx
-; X64-NEXT:    xorq %rax, %rcx
-; X64-NEXT:    orq %rbx, %rcx
-; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    negq %rdx
+; X64-NEXT:    xorq %rdx, %rbp
+; X64-NEXT:    xorq %rax, %rdx
+; X64-NEXT:    orq %rbp, %rdx
+; X64-NEXT:    orq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq %rcx, 8(%rax)
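
The test hunks in this excerpt are regenerated FileCheck expectations; the change driving them is the small X86InstrInfo.cpp hunk at the top of the diff, which lets the MachineCombiner treat integer ADD like the other opcodes it already reassociates. A minimal sketch of that hook, assuming the usual opcode white-list shape; the exact case list below is paraphrased from the commit log, not copied from the source hunk:

    // Sketch only: MachineCombiner queries this hook before rebalancing a
    // two-instruction chain. D136396 extends the integer white-list with
    // the register-register ADD forms.
    bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
      switch (Inst.getOpcode()) {
      case X86::ADD8rr:    // newly accepted by this patch (assumed list)
      case X86::ADD16rr:
      case X86::ADD32rr:
      case X86::ADD64rr:
      case X86::AND32rr:   // representative pre-existing entries,
      case X86::OR32rr:    // heavily abbreviated here
      case X86::XOR32rr:
        return true;
      default:             // the real switch covers many more opcodes,
        return false;      // including FP ops guarded by fast-math flags
      }
    }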

diff  --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index 8c2b945d6a8ce..fd25842180922 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -231,8 +231,8 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
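
The smul_fix.ll hunk above is the clearest before/after: previously the first imull product was folded into %edx, the high half of the widening mull, before the second imull product had been summed (addl, imull, addl, a serial chain through %edx); now the two imull products are added to each other first, so that addition no longer waits on the mull. The same rewrite in C terms, as an illustrative sketch (the function and variable names are made up for the example):

    /* Reassociating ((a+b)+c)+d into (a+b)+(c+d) keeps the semantics,
     * since integer ADD is associative and commutative, but reduces the
     * dependence depth of the final sum from three adds to two. */
    long sum4(long a, long b, long c, long d) {
      long t1 = a + b; /* t1 and t2 do not depend on each other, */
      long t2 = c + d; /* so they can issue in the same cycle    */
      return t1 + t2;
    }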

diff  --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 996601ed3be64..0532916b1e4ca 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -369,8 +369,8 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 28
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 32
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -383,62 +383,62 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    imull %ebx, %edi
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull %ebp, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    imull %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    xorl %edi, %edx
 ; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %ebp, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    xorl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    notl %ecx
-; X86-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    cmovel %ebp, %esi
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovel %ebx, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
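
The 128-bit and 256-bit overflow-checked multiplies below show the same rewrite at scale: each high word is built from a widening mulq plus two cross-product imulqs, and after this patch the two independent imulq results are added to each other before being folded into the mulq high half. A simplified C model of that decomposition, unsigned for clarity and with a made-up function name, illustration only:

    typedef unsigned long long u64;

    /* 64x64 -> 128: one widening multiply plus two cross products.
     * The cross products do not depend on the mulq result, so summing
     * them with each other first shortens the critical path. */
    void mul128(u64 a_lo, u64 a_hi, u64 b_lo, u64 b_hi, u64 *lo, u64 *hi) {
      unsigned __int128 p = (unsigned __int128)a_lo * b_lo; /* mulq    */
      *lo = (u64)p;
      *hi = (u64)(p >> 64) + (a_lo * b_hi + a_hi * b_lo);   /* 2 imulq */
    }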

diff  --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 367ca660cda14..fc40c539f37c7 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -23,8 +23,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rdi, %rdx
 ; X64-NEXT:    imulq %rcx, %r14
+; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    addq %rdx, %r14
 ; X64-NEXT:    movq %rcx, %rdi
 ; X64-NEXT:    sarq $63, %rdi
@@ -33,8 +33,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r15, %rdx
 ; X64-NEXT:    imulq %r10, %rdi
+; X64-NEXT:    addq %r15, %rdi
 ; X64-NEXT:    addq %rdx, %rdi
 ; X64-NEXT:    addq %r9, %r11
 ; X64-NEXT:    adcq %r14, %rdi
@@ -84,8 +84,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $56, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 76
+; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 72
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -102,20 +102,19 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -127,198 +126,197 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    imull %esi, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    imull %esi, %ebx
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    imull %esi, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    addl %eax, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl (%esp), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull %ebx, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    imull %esi, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    imull %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -326,7 +324,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -360,8 +358,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
+; X64-NEXT:    movq %r8, %rbx
 ; X64-NEXT:    movq %rcx, %r11
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rsi, %r15
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
@@ -372,31 +371,29 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rsi, %r10
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rsi, %r8
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r10, %r14
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %r8, %r10
 ; X64-NEXT:    adcq %rcx, %r12
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %ecx
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r12, %rbx
-; X64-NEXT:    adcq %rcx, %r11
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %r12, %r14
+; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %r8, %rcx
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r8, %r13
@@ -405,69 +402,69 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    movq %r9, %rsi
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r12, %r10
+; X64-NEXT:    adcq %r12, %r11
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %r15, %r9
 ; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r10, %r8
+; X64-NEXT:    addq %r11, %r8
 ; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rax, %rbp
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %rbp
-; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r10, %rbp
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
+; X64-NEXT:    movq %rdi, %rcx
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    movq %r9, %rsi
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r10, %r9
+; X64-NEXT:    addq %r11, %r9
 ; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    adcq %r13, %r11
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    adcq %r13, %r10
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r11, %r13
+; X64-NEXT:    addq %r10, %r13
 ; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r10
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %rbp, %rdi
+; X64-NEXT:    adcq %rax, %r11
+; X64-NEXT:    addq %r8, %rdi
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %rbp, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    addq %rbx, %r13
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
-; X64-NEXT:    setb %cl
+; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    addq %r14, %r13
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Folded Reload
+; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
-; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r8
@@ -476,118 +473,117 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    addq %r8, %rax
 ; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %r8, %rsi
 ; X64-NEXT:    adcq %rdi, %r9
 ; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %r9, %r14
 ; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    addq %r13, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r10, %rsi
+; X64-NEXT:    addq %r13, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r11, %rsi
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X64-NEXT:    adcq %rax, %r14
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    movq %rbx, %r10
+; X64-NEXT:    movq %r10, %r13
 ; X64-NEXT:    sarq $63, %r13
 ; X64-NEXT:    movq %r13, %rcx
 ; X64-NEXT:    imulq %r12, %rcx
 ; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    imulq %r13, %r15
+; X64-NEXT:    addq %rcx, %r15
 ; X64-NEXT:    addq %rdx, %r15
 ; X64-NEXT:    movq %r13, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    imulq %rdi, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    imulq %rsi, %rcx
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    imulq %r13, %rbx
+; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    imulq %r13, %rsi
-; X64-NEXT:    addq %rcx, %rsi
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    addq %rdx, %rbx
 ; X64-NEXT:    addq %rax, %r8
-; X64-NEXT:    adcq %r15, %rsi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    adcq %r15, %rbx
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    addq %r9, %r15
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    addq %r11, %r15
 ; X64-NEXT:    adcq %r9, %r13
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    addq %rax, %r13
 ; X64-NEXT:    movzbl %cl, %r9d
 ; X64-NEXT:    adcq %rdx, %r9
 ; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    adcq %rsi, %r9
+; X64-NEXT:    adcq %rbx, %r9
 ; X64-NEXT:    sarq $63, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    imulq %r12, %r8
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    imulq %r12, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
 ; X64-NEXT:    movq %rdi, %rbx
 ; X64-NEXT:    imulq %r12, %rbx
-; X64-NEXT:    addq %r8, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    imulq %r12, %rcx
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    addq %rcx, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    imulq %r12, %rdx
 ; X64-NEXT:    imulq %r12, %r10
 ; X64-NEXT:    addq %rdx, %r10
+; X64-NEXT:    movq %r10, %rcx
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    addq %r10, %rbx
 ; X64-NEXT:    addq %rsi, %r8
-; X64-NEXT:    adcq %rbx, %r10
+; X64-NEXT:    adcq %rbx, %rcx
 ; X64-NEXT:    movq %rsi, %rbx
-; X64-NEXT:    addq %r11, %rbx
-; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    addq %r10, %rbx
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    addq %rax, %rbx
-; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    setb %cl
-; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    adcq %rdx, %r10
+; X64-NEXT:    setb %r12b
+; X64-NEXT:    addq %rax, %r10
+; X64-NEXT:    movzbl %r12b, %eax
 ; X64-NEXT:    adcq %rdx, %rax
-; X64-NEXT:    addq %r8, %r11
-; X64-NEXT:    adcq %r10, %rax
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT:    addq %r8, %r10
+; X64-NEXT:    adcq %rcx, %rax
+; X64-NEXT:    addq %r11, %rsi
 ; X64-NEXT:    adcq %r15, %rbx
-; X64-NEXT:    adcq %r13, %r11
+; X64-NEXT:    adcq %r13, %r10
 ; X64-NEXT:    adcq %r9, %rax
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %r11
+; X64-NEXT:    adcq %r14, %r10
 ; X64-NEXT:    adcq %rbp, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    xorq %rcx, %rax
-; X64-NEXT:    xorq %rcx, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    xorq %rdx, %rax
+; X64-NEXT:    xorq %rdx, %rbx
 ; X64-NEXT:    orq %rax, %rbx
-; X64-NEXT:    xorq %rcx, %r11
-; X64-NEXT:    xorq %rsi, %rcx
-; X64-NEXT:    orq %r11, %rcx
-; X64-NEXT:    orq %rbx, %rcx
+; X64-NEXT:    xorq %rdx, %r10
+; X64-NEXT:    xorq %rsi, %rdx
+; X64-NEXT:    orq %r10, %rdx
+; X64-NEXT:    orq %rbx, %rdx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq %rdx, 24(%rax)
+; X64-NEXT:    movq %rcx, 24(%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; X64-NEXT:    movq %rcx, (%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -613,42 +609,43 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $156, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 176
+; X86-NEXT:    subl $152, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 172
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -659,354 +656,353 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
@@ -1021,7 +1017,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
@@ -1063,103 +1059,103 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
@@ -1169,14 +1165,14 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1187,89 +1183,88 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    addl %eax, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    setb %dl
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    setb %al
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    imull %edi, %esi
 ; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    imull %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -1287,26 +1282,23 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    imull %edi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; X86-NEXT:    addl %ebx, %ecx
@@ -1317,35 +1309,36 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -1354,189 +1347,187 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %bl, %ebx
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movzbl %bl, %edi
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    imull %esi, %eax
 ; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    imull %esi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %dl
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl (%esp), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %ebx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    xorl %edi, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    xorl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    xorl %edi, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    xorl %edi, %ebp
-; X86-NEXT:    orl %eax, %ebp
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    xorl %esi, %ebx
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 28(%eax)
+; X86-NEXT:    movl %ebp, 28(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1552,7 +1543,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $156, %esp
+; X86-NEXT:    addl $152, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
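
A note on the sheer volume of churn in these hunks: CodeGen tests like this one pin the exact instruction sequence llc emits, and the CHECK-NEXT bodies are regenerated mechanically (llvm/utils/update_llc_test_checks.py) from the RUN lines at the top of each file. As a hedged sketch of the driving pattern (the triple and check prefix here are assumptions; the real RUN lines fall outside this excerpt):

; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86

That is why a small enabling change in X86InstrInfo.cpp legitimately reshuffles register assignments, spill slots, and even frame sizes (the addl $156, %esp epilogue above becoming addl $152, %esp) across every affected function.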

diff  --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll
index 0226052402cb8..7a3e48572b8e2 100644
--- a/llvm/test/CodeGen/X86/sse-regcall.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall.ll
@@ -196,7 +196,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
-; WIN32-NEXT:    subl $12, %esp
+; WIN32-NEXT:    subl $16, %esp
 ; WIN32-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    movl %edx, %ebx
@@ -207,36 +207,37 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN32-NEXT:    subl %esi, %ebx
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    subl %ecx, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    imull %eax, %ecx
+; WIN32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %ebx, %eax
-; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    movl %esi, %edx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull %ebx, %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    subl %ebp, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, %ecx
 ; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    imull %ebx, %ecx
-; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull %edx, %edi
 ; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; WIN32-NEXT:    addl %esi, %edi
-; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    imull %eax, %edx
-; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %ebp, %eax
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    addl %eax, %edi
 ; WIN32-NEXT:    addl %ecx, %edi
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    addl $12, %esp
+; WIN32-NEXT:    addl $16, %esp
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
@@ -270,18 +271,18 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN64-NEXT:    # kill: def $r11d killed $r11d killed $r11
 ; WIN64-NEXT:    subl %r12d, %r11d
 ; WIN64-NEXT:    imull %edx, %r11d
-; WIN64-NEXT:    addl %r9d, %r11d
 ; WIN64-NEXT:    leal (%r14,%r15), %edx
-; WIN64-NEXT:    movl %r14d, %r9d
-; WIN64-NEXT:    subl %r15d, %r9d
-; WIN64-NEXT:    imull %esi, %r9d
-; WIN64-NEXT:    addl %r11d, %r9d
+; WIN64-NEXT:    # kill: def $r14d killed $r14d killed $r14
+; WIN64-NEXT:    subl %r15d, %r14d
+; WIN64-NEXT:    imull %esi, %r14d
+; WIN64-NEXT:    addl %r11d, %r14d
 ; WIN64-NEXT:    addl %ecx, %eax
 ; WIN64-NEXT:    imull %r8d, %eax
 ; WIN64-NEXT:    imull %ebx, %r10d
-; WIN64-NEXT:    addl %r10d, %eax
 ; WIN64-NEXT:    imull %edi, %edx
+; WIN64-NEXT:    addl %r10d, %edx
 ; WIN64-NEXT:    addl %edx, %eax
+; WIN64-NEXT:    addl %r14d, %eax
 ; WIN64-NEXT:    addl %r9d, %eax
 ; WIN64-NEXT:    popq %rbx
 ; WIN64-NEXT:    retq
@@ -311,19 +312,19 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; LINUXOSX-NEXT:    leal (%r13,%r14), %r11d
 ; LINUXOSX-NEXT:    movl %r13d, %r12d
 ; LINUXOSX-NEXT:    subl %r14d, %r12d
+; LINUXOSX-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
 ; LINUXOSX-NEXT:    imull %edx, %r12d
-; LINUXOSX-NEXT:    movl {{[0-9]+}}(%rsp), %edx
-; LINUXOSX-NEXT:    addl %r9d, %r12d
-; LINUXOSX-NEXT:    movl %r15d, %r9d
-; LINUXOSX-NEXT:    subl %edx, %r9d
-; LINUXOSX-NEXT:    imull %esi, %r9d
-; LINUXOSX-NEXT:    addl %r12d, %r9d
+; LINUXOSX-NEXT:    movl %r15d, %edx
+; LINUXOSX-NEXT:    subl %r14d, %edx
+; LINUXOSX-NEXT:    imull %esi, %edx
+; LINUXOSX-NEXT:    addl %r12d, %edx
 ; LINUXOSX-NEXT:    addl %ecx, %eax
 ; LINUXOSX-NEXT:    imull %r8d, %eax
 ; LINUXOSX-NEXT:    imull %r10d, %r11d
-; LINUXOSX-NEXT:    addl %r11d, %eax
-; LINUXOSX-NEXT:    addl %r15d, %edx
-; LINUXOSX-NEXT:    imull %edi, %edx
+; LINUXOSX-NEXT:    addl %r15d, %r14d
+; LINUXOSX-NEXT:    imull %edi, %r14d
+; LINUXOSX-NEXT:    addl %r11d, %r14d
+; LINUXOSX-NEXT:    addl %r14d, %eax
 ; LINUXOSX-NEXT:    addl %edx, %eax
 ; LINUXOSX-NEXT:    addl %r9d, %eax
 ; LINUXOSX-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll
index b5b9ce95a46ba..833b13e796787 100644
--- a/llvm/test/CodeGen/X86/stack-clash-large.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-large.ll
@@ -98,13 +98,13 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ; CHECK-X64-NEXT:    .cfi_def_cfa_offset 71888
 ; CHECK-X64-NEXT:    .cfi_offset %rax, -16
 ; CHECK-X64-NEXT:    movl 71888(%rsp), %eax
-; CHECK-X64-NEXT:    addl %esi, %edi
 ; CHECK-X64-NEXT:    addl %ecx, %edx
-; CHECK-X64-NEXT:    addl %edi, %edx
-; CHECK-X64-NEXT:    addl %r9d, %r8d
 ; CHECK-X64-NEXT:    addl 71896(%rsp), %eax
+; CHECK-X64-NEXT:    addl %esi, %edx
+; CHECK-X64-NEXT:    addl %r9d, %eax
 ; CHECK-X64-NEXT:    addl %r8d, %eax
 ; CHECK-X64-NEXT:    addl %edx, %eax
+; CHECK-X64-NEXT:    addl %edi, %eax
 ; CHECK-X64-NEXT:    movl %eax, 264(%rsp)
 ; CHECK-X64-NEXT:    movl %eax, 28664(%rsp)
 ; CHECK-X64-NEXT:    addq $71872, %rsp # imm = 0x118C0
@@ -141,16 +141,16 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ; CHECK-X86-NEXT:    .cfi_offset %edx, -12
 ; CHECK-X86-NEXT:    .cfi_offset %esi, -8
 ; CHECK-X86-NEXT:    movl 72056(%esp), %eax
-; CHECK-X86-NEXT:    movl 72048(%esp), %edx
-; CHECK-X86-NEXT:    movl 72040(%esp), %ecx
+; CHECK-X86-NEXT:    movl 72048(%esp), %ecx
+; CHECK-X86-NEXT:    movl 72040(%esp), %edx
 ; CHECK-X86-NEXT:    movl 72032(%esp), %esi
 ; CHECK-X86-NEXT:    addl 72036(%esp), %esi
-; CHECK-X86-NEXT:    addl 72044(%esp), %ecx
-; CHECK-X86-NEXT:    addl %esi, %ecx
-; CHECK-X86-NEXT:    addl 72052(%esp), %edx
+; CHECK-X86-NEXT:    addl 72044(%esp), %edx
+; CHECK-X86-NEXT:    addl 72052(%esp), %ecx
 ; CHECK-X86-NEXT:    addl 72060(%esp), %eax
-; CHECK-X86-NEXT:    addl %edx, %eax
 ; CHECK-X86-NEXT:    addl %ecx, %eax
+; CHECK-X86-NEXT:    addl %edx, %eax
+; CHECK-X86-NEXT:    addl %esi, %eax
 ; CHECK-X86-NEXT:    movl %eax, 392(%esp)
 ; CHECK-X86-NEXT:    movl %eax, 28792(%esp)
 ; CHECK-X86-NEXT:    addl $72012, %esp # imm = 0x1194C
@@ -184,13 +184,13 @@ define void @push_before_probe(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i
 ; CHECK-X32-NEXT:    .cfi_def_cfa_offset 71888
 ; CHECK-X32-NEXT:    .cfi_offset %rax, -16
 ; CHECK-X32-NEXT:    movl 71888(%esp), %eax
-; CHECK-X32-NEXT:    addl %esi, %edi
 ; CHECK-X32-NEXT:    addl %ecx, %edx
-; CHECK-X32-NEXT:    addl %edi, %edx
-; CHECK-X32-NEXT:    addl %r9d, %r8d
 ; CHECK-X32-NEXT:    addl 71896(%esp), %eax
+; CHECK-X32-NEXT:    addl %esi, %edx
+; CHECK-X32-NEXT:    addl %r9d, %eax
 ; CHECK-X32-NEXT:    addl %r8d, %eax
 ; CHECK-X32-NEXT:    addl %edx, %eax
+; CHECK-X32-NEXT:    addl %edi, %eax
 ; CHECK-X32-NEXT:    movl %eax, 264(%esp)
 ; CHECK-X32-NEXT:    movl %eax, 28664(%esp)
 ; CHECK-X32-NEXT:    addl $71872, %esp # imm = 0x118C0
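
The push_before_probe hunks above show the rebalancing on a longer chain: the argument sum that used to accumulate serially into %eax is now split into independent partial sums that retire in parallel before the final merges. In IR terms (a hypothetical reduction, not one of this patch's tests), reassociation steers a serial fold toward pairwise evaluation:

define i32 @sum8(i32 %a, i32 %b, i32 %c, i32 %d,
                 i32 %e, i32 %f, i32 %g, i32 %h) {
entry:
  ; Tree-shaped evaluation that the reassociated output approximates:
  ; a dependence height of log2(8) = 3 adds instead of 7.
  %ab = add i32 %a, %b
  %cd = add i32 %c, %d
  %ef = add i32 %e, %f
  %gh = add i32 %g, %h
  %abcd = add i32 %ab, %cd
  %efgh = add i32 %ef, %gh
  %sum = add i32 %abcd, %efgh
  ret i32 %sum
}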

diff  --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index 2b02656071a7b..84ace4b5b0950 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -442,12 +442,12 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl %edi, %ebx
-; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edx, %r12d
 ; CHECK-NEXT:    movl %ecx, %r13d
-; CHECK-NEXT:    movl %r8d, %ebp
-; CHECK-NEXT:    movl %r9d, %r14d
+; CHECK-NEXT:    movl %r8d, %r14d
+; CHECK-NEXT:    movl %r9d, %r15d
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
@@ -472,11 +472,10 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    callq _bar ## 160-byte Folded Reload
 ; CHECK-NEXT:  Ltmp13:
-; CHECK-NEXT:    addq %r15, %rbx
 ; CHECK-NEXT:    addq %r12, %rbx
 ; CHECK-NEXT:    addq %r13, %rbx
-; CHECK-NEXT:    addq %rbp, %rbx
 ; CHECK-NEXT:    addq %r14, %rbx
+; CHECK-NEXT:    addq %r15, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
@@ -517,6 +516,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rbp, %rbx
 ; CHECK-NEXT:    movq %rbx, %rax
 ; CHECK-NEXT:    addq $168, %rsp
 ; CHECK-NEXT:    popq %rbx

diff  --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
index 9338da5de8001..d6fef28bc8c66 100644
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -554,12 +554,12 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    movl %edi, %ebx
-; CHECK-NEXT:    movl %esi, %r15d
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    movl %esi, %ebx
 ; CHECK-NEXT:    movl %edx, %r12d
 ; CHECK-NEXT:    movl %ecx, %r13d
-; CHECK-NEXT:    movl %r8d, %ebp
-; CHECK-NEXT:    movl %r9d, %r14d
+; CHECK-NEXT:    movl %r8d, %r14d
+; CHECK-NEXT:    movl %r9d, %r15d
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
@@ -584,11 +584,10 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    callq _bar ## 160-byte Folded Reload
 ; CHECK-NEXT:  Ltmp14:
-; CHECK-NEXT:    addq %r15, %rbx
 ; CHECK-NEXT:    addq %r12, %rbx
 ; CHECK-NEXT:    addq %r13, %rbx
-; CHECK-NEXT:    addq %rbp, %rbx
 ; CHECK-NEXT:    addq %r14, %rbx
+; CHECK-NEXT:    addq %r15, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
@@ -629,6 +628,7 @@ define i64 @test11(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %
 ; CHECK-NEXT:    addq %rax, %rbx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; CHECK-NEXT:    addq %rax, %rbx
+; CHECK-NEXT:    addq %rbp, %rbx
 ; CHECK-NEXT:    movq %rbx, %rax
 ; CHECK-NEXT:    addq $168, %rsp
 ; CHECK-NEXT:    popq %rbx

diff  --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll
index d29548d488e63..24c1509b2325e 100644
--- a/llvm/test/CodeGen/X86/swift-return.ll
+++ b/llvm/test/CodeGen/X86/swift-return.ll
@@ -147,9 +147,11 @@ define dso_local i32 @test3(i32 %key) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    callq gen3 at PLT
-; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    # kill: def $edx killed $edx def $rdx
+; CHECK-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    addl %r8d, %ecx
 ; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    addl %r8d, %eax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -358,9 +360,9 @@ define swiftcc { double, i64 } @test6() #0 {
 ; CHECK-NEXT:    addsd %xmm1, %xmm0
 ; CHECK-NEXT:    addsd %xmm2, %xmm0
 ; CHECK-NEXT:    addsd %xmm3, %xmm0
-; CHECK-NEXT:    addq %rdx, %rax
+; CHECK-NEXT:    addq %rdx, %rcx
+; CHECK-NEXT:    addq %r8, %rcx
 ; CHECK-NEXT:    addq %rcx, %rax
-; CHECK-NEXT:    addq %r8, %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll
index 14186537daea5..b11c9de00886f 100644
--- a/llvm/test/CodeGen/X86/twoaddr-lea.ll
+++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll
@@ -31,11 +31,13 @@ define i32 @test1(i32 %X) nounwind {
 define i32 @test2(i32 inreg %a, i32 inreg %b, i32 %c, i32 %d) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    ## kill: def $ecx killed $ecx def $rcx
+; CHECK-NEXT:    ## kill: def $edx killed $edx def $rdx
 ; CHECK-NEXT:    ## kill: def $esi killed $esi def $rsi
 ; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    leal (%rdi,%rsi), %eax
-; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    addl %edi, %esi
+; CHECK-NEXT:    leal (%rdx,%rcx), %eax
+; CHECK-NEXT:    addl %esi, %eax
 ; CHECK-NEXT:    retq
 entry:
  %add = add i32 %b, %a
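
The test2 diff just above is the most readable illustration of what the new reassociation buys: instead of chaining every add onto a single accumulator, the backend now computes independent pairs (addl %edi, %esi and leal (%rdx,%rcx), %eax) and joins them with one final addl, cutting the dependence height of the chain from three adds to two. A minimal IR sketch of the same shape (the function and value names are hypothetical, not taken from this patch's tests):

define i32 @add_chain(i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
  ; Serial form ((a + b) + c) + d: three dependent adds, height 3.
  %t0 = add i32 %a, %b
  %t1 = add i32 %t0, %c
  %t2 = add i32 %t1, %d
  ; With reassociation the backend is free to evaluate the equivalent
  ; (a + b) + (c + d), so the two inner adds issue in parallel, height 2.
  ret i32 %t2
}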

diff  --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index b516d69c676df..fcebba2e90fef 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -93,7 +93,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
@@ -106,7 +106,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -142,12 +142,12 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
@@ -176,13 +176,13 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
@@ -201,123 +201,120 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl (%esp), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addl %esi, %edx
@@ -368,7 +365,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
@@ -380,7 +377,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -428,23 +425,20 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -452,41 +446,44 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %esi, %ecx
@@ -501,9 +498,9 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %esi, 8(%edx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 12(%edx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 16(%edx)
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 16(%edx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 20(%edx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 24(%edx)
@@ -530,16 +527,16 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r8, %r11
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %r8, %rbp
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %r10, %rbp
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    addq %rbx, %r15
@@ -551,13 +548,13 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    addq %r15, %rbx
 ; X64-NEXT:    adcq %r14, %r12
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %r8d
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movzbl %al, %r10d
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r12, %r13
-; X64-NEXT:    adcq %r8, %r15
+; X64-NEXT:    adcq %r10, %r15
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %r12
@@ -565,13 +562,13 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r12, %r8
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %r12, %r10
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %r8, %r12
+; X64-NEXT:    addq %r10, %r12
 ; X64-NEXT:    adcq %rbp, %rdx
 ; X64-NEXT:    imulq %r9, %r11
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
@@ -580,30 +577,30 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X64-NEXT:    adcq %rdx, %r11
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r8, %rbp
+; X64-NEXT:    addq %r10, %rbp
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    addq %rbp, %rax
 ; X64-NEXT:    adcq %r13, %rdx
-; X64-NEXT:    imulq %r8, %r10
-; X64-NEXT:    addq %rdx, %r10
+; X64-NEXT:    imulq %r10, %r8
+; X64-NEXT:    addq %rdx, %r8
 ; X64-NEXT:    addq %r14, %r15
 ; X64-NEXT:    adcq %r12, %rax
-; X64-NEXT:    adcq %r11, %r10
+; X64-NEXT:    adcq %r11, %r8
 ; X64-NEXT:    imulq %r9, %rcx
-; X64-NEXT:    addq %r10, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; X64-NEXT:    imulq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    imulq {{[0-9]+}}(%rsp), %rsi
 ; X64-NEXT:    addq %rdx, %rsi
+; X64-NEXT:    addq %r8, %rsi
 ; X64-NEXT:    addq %rcx, %rsi
 ; X64-NEXT:    movq %rbx, 8(%rdi)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload

diff  --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index cb4bdd1ede75c..dca010064b587 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -185,8 +185,8 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl

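The umul_fix hunk above is the shape that recurs throughout this commit: on 32-bit x86 a 64-bit multiply lowers to one mull for the low halves plus two imull cross products, and the cross products do not depend on the mull result. With ADD reassociation enabled, the two independent imull values are summed first and only then added into %edx, leaving one add on the critical path behind the mull instead of two. A minimal C sketch of source that lowers this way on a 32-bit target; the function name is illustrative, not taken from the tests:

/* Hypothetical sketch, not from umul_fix.ll: compiled for a 32-bit
 * x86 target (e.g. clang -m32 -O2) this lowers to one mull plus two
 * imull cross products, the pattern shown in the diff above. */
unsigned long long mul64(unsigned long long x, unsigned long long y) {
    /* hi(x*y) = hi(lo(x)*lo(y)) + lo(x)*hi(y) + hi(x)*lo(y); the two
       cross products are independent of each other and of the mull,
       so the combiner may add them together before folding the sum
       into the mull high half. */
    return x * y;
}
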
diff  --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
index 671d7c21013de..7a5198dd6a0fa 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll
@@ -295,19 +295,21 @@ define i1 @t8_3_2(i8 %X) nounwind {
 define i1 @t64_3_2(i64 %X) nounwind {
 ; X86-LABEL: t64_3_2:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl $-1431655765, %edx # imm = 0xAAAAAAAB
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull $-1431655765, {{[0-9]+}}(%esp), %esi # imm = 0xAAAAAAAB
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    addl $-1431655766, %eax # imm = 0xAAAAAAAA
-; X86-NEXT:    adcl $-1431655766, %edx # imm = 0xAAAAAAAA
+; X86-NEXT:    adcl $-1431655766, %esi # imm = 0xAAAAAAAA
 ; X86-NEXT:    cmpl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    sbbl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    sbbl $1431655765, %esi # imm = 0x55555555
 ; X86-NEXT:    setb %al
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t64_3_2:

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index dbec86755a969..7bb7c9b481d39 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3309,8 +3309,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    movq %r14, %rax
 ; SSE2-NEXT:    mulq %r12
 ; SSE2-NEXT:    movq %rax, %rdi
-; SSE2-NEXT:    addq %rbx, %rdx
 ; SSE2-NEXT:    imulq %r9, %r12
+; SSE2-NEXT:    addq %rbx, %r12
 ; SSE2-NEXT:    addq %rdx, %r12
 ; SSE2-NEXT:    movq %r9, %rbx
 ; SSE2-NEXT:    sarq $63, %rbx
@@ -3319,8 +3319,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    movq %rbx, %rax
 ; SSE2-NEXT:    mulq %r10
 ; SSE2-NEXT:    movq %rax, %r15
-; SSE2-NEXT:    addq %r13, %rdx
 ; SSE2-NEXT:    imulq %r10, %rbx
+; SSE2-NEXT:    addq %r13, %rbx
 ; SSE2-NEXT:    addq %rdx, %rbx
 ; SSE2-NEXT:    addq %rdi, %r15
 ; SSE2-NEXT:    adcq %r12, %rbx
@@ -3363,8 +3363,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    movq %rsi, %rax
 ; SSE2-NEXT:    mulq %rbx
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    addq %r10, %rdx
 ; SSE2-NEXT:    imulq %rbp, %rbx
+; SSE2-NEXT:    addq %r10, %rbx
 ; SSE2-NEXT:    addq %rdx, %rbx
 ; SSE2-NEXT:    movq %rbp, %r10
 ; SSE2-NEXT:    sarq $63, %r10
@@ -3373,8 +3373,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    movq %r10, %rax
 ; SSE2-NEXT:    mulq %r8
 ; SSE2-NEXT:    movq %rax, %r11
-; SSE2-NEXT:    addq %r14, %rdx
 ; SSE2-NEXT:    imulq %r8, %r10
+; SSE2-NEXT:    addq %r14, %r10
 ; SSE2-NEXT:    addq %rdx, %r10
 ; SSE2-NEXT:    addq %r9, %r11
 ; SSE2-NEXT:    adcq %rbx, %r10
@@ -3445,8 +3445,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    movq %r14, %rax
 ; SSSE3-NEXT:    mulq %r12
 ; SSSE3-NEXT:    movq %rax, %rdi
-; SSSE3-NEXT:    addq %rbx, %rdx
 ; SSSE3-NEXT:    imulq %r9, %r12
+; SSSE3-NEXT:    addq %rbx, %r12
 ; SSSE3-NEXT:    addq %rdx, %r12
 ; SSSE3-NEXT:    movq %r9, %rbx
 ; SSSE3-NEXT:    sarq $63, %rbx
@@ -3455,8 +3455,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    movq %rbx, %rax
 ; SSSE3-NEXT:    mulq %r10
 ; SSSE3-NEXT:    movq %rax, %r15
-; SSSE3-NEXT:    addq %r13, %rdx
 ; SSSE3-NEXT:    imulq %r10, %rbx
+; SSSE3-NEXT:    addq %r13, %rbx
 ; SSSE3-NEXT:    addq %rdx, %rbx
 ; SSSE3-NEXT:    addq %rdi, %r15
 ; SSSE3-NEXT:    adcq %r12, %rbx
@@ -3499,8 +3499,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    movq %rsi, %rax
 ; SSSE3-NEXT:    mulq %rbx
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    addq %r10, %rdx
 ; SSSE3-NEXT:    imulq %rbp, %rbx
+; SSSE3-NEXT:    addq %r10, %rbx
 ; SSSE3-NEXT:    addq %rdx, %rbx
 ; SSSE3-NEXT:    movq %rbp, %r10
 ; SSSE3-NEXT:    sarq $63, %r10
@@ -3509,8 +3509,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    movq %r10, %rax
 ; SSSE3-NEXT:    mulq %r8
 ; SSSE3-NEXT:    movq %rax, %r11
-; SSSE3-NEXT:    addq %r14, %rdx
 ; SSSE3-NEXT:    imulq %r8, %r10
+; SSSE3-NEXT:    addq %r14, %r10
 ; SSSE3-NEXT:    addq %rdx, %r10
 ; SSSE3-NEXT:    addq %r9, %r11
 ; SSSE3-NEXT:    adcq %rbx, %r10
@@ -3581,8 +3581,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    movq %r14, %rax
 ; SSE41-NEXT:    mulq %r12
 ; SSE41-NEXT:    movq %rax, %rdi
-; SSE41-NEXT:    addq %rbx, %rdx
 ; SSE41-NEXT:    imulq %r9, %r12
+; SSE41-NEXT:    addq %rbx, %r12
 ; SSE41-NEXT:    addq %rdx, %r12
 ; SSE41-NEXT:    movq %r9, %rbx
 ; SSE41-NEXT:    sarq $63, %rbx
@@ -3591,8 +3591,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    movq %rbx, %rax
 ; SSE41-NEXT:    mulq %r10
 ; SSE41-NEXT:    movq %rax, %r15
-; SSE41-NEXT:    addq %r13, %rdx
 ; SSE41-NEXT:    imulq %r10, %rbx
+; SSE41-NEXT:    addq %r13, %rbx
 ; SSE41-NEXT:    addq %rdx, %rbx
 ; SSE41-NEXT:    addq %rdi, %r15
 ; SSE41-NEXT:    adcq %r12, %rbx
@@ -3635,8 +3635,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    movq %rsi, %rax
 ; SSE41-NEXT:    mulq %rbx
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    addq %r10, %rdx
 ; SSE41-NEXT:    imulq %rbp, %rbx
+; SSE41-NEXT:    addq %r10, %rbx
 ; SSE41-NEXT:    addq %rdx, %rbx
 ; SSE41-NEXT:    movq %rbp, %r10
 ; SSE41-NEXT:    sarq $63, %r10
@@ -3645,8 +3645,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    movq %r10, %rax
 ; SSE41-NEXT:    mulq %r8
 ; SSE41-NEXT:    movq %rax, %r11
-; SSE41-NEXT:    addq %r14, %rdx
 ; SSE41-NEXT:    imulq %r8, %r10
+; SSE41-NEXT:    addq %r14, %r10
 ; SSE41-NEXT:    addq %rdx, %r10
 ; SSE41-NEXT:    addq %r9, %r11
 ; SSE41-NEXT:    adcq %rbx, %r10
@@ -3716,8 +3716,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    movq %r14, %rax
 ; AVX-NEXT:    mulq %r12
 ; AVX-NEXT:    movq %rax, %rdi
-; AVX-NEXT:    addq %rbx, %rdx
 ; AVX-NEXT:    imulq %r9, %r12
+; AVX-NEXT:    addq %rbx, %r12
 ; AVX-NEXT:    addq %rdx, %r12
 ; AVX-NEXT:    movq %r9, %rbx
 ; AVX-NEXT:    sarq $63, %rbx
@@ -3726,8 +3726,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    movq %rbx, %rax
 ; AVX-NEXT:    mulq %r10
 ; AVX-NEXT:    movq %rax, %r15
-; AVX-NEXT:    addq %r13, %rdx
 ; AVX-NEXT:    imulq %r10, %rbx
+; AVX-NEXT:    addq %r13, %rbx
 ; AVX-NEXT:    addq %rdx, %rbx
 ; AVX-NEXT:    addq %rdi, %r15
 ; AVX-NEXT:    adcq %r12, %rbx
@@ -3770,8 +3770,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    movq %rsi, %rax
 ; AVX-NEXT:    mulq %rbx
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    addq %r10, %rdx
 ; AVX-NEXT:    imulq %rbp, %rbx
+; AVX-NEXT:    addq %r10, %rbx
 ; AVX-NEXT:    addq %rdx, %rbx
 ; AVX-NEXT:    movq %rbp, %r10
 ; AVX-NEXT:    sarq $63, %r10
@@ -3780,8 +3780,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    movq %r10, %rax
 ; AVX-NEXT:    mulq %r8
 ; AVX-NEXT:    movq %rax, %r11
-; AVX-NEXT:    addq %r14, %rdx
 ; AVX-NEXT:    imulq %r8, %r10
+; AVX-NEXT:    addq %r14, %r10
 ; AVX-NEXT:    addq %rdx, %r10
 ; AVX-NEXT:    addq %r9, %r11
 ; AVX-NEXT:    adcq %rbx, %r10
@@ -3851,8 +3851,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    movq %r15, %rax
 ; AVX512F-NEXT:    mulq %r12
 ; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    addq %rbx, %rdx
 ; AVX512F-NEXT:    imulq %rsi, %r12
+; AVX512F-NEXT:    addq %rbx, %r12
 ; AVX512F-NEXT:    addq %rdx, %r12
 ; AVX512F-NEXT:    movq %rsi, %rbx
 ; AVX512F-NEXT:    sarq $63, %rbx
@@ -3861,8 +3861,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    movq %rbx, %rax
 ; AVX512F-NEXT:    mulq %r10
 ; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    addq %r13, %rdx
 ; AVX512F-NEXT:    imulq %r10, %rbx
+; AVX512F-NEXT:    addq %r13, %rbx
 ; AVX512F-NEXT:    addq %rdx, %rbx
 ; AVX512F-NEXT:    addq %rcx, %r14
 ; AVX512F-NEXT:    adcq %r12, %rbx
@@ -3905,8 +3905,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    movq %r8, %rax
 ; AVX512F-NEXT:    mulq %rsi
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %r11, %rdx
 ; AVX512F-NEXT:    imulq %rbp, %rsi
+; AVX512F-NEXT:    addq %r11, %rsi
 ; AVX512F-NEXT:    addq %rdx, %rsi
 ; AVX512F-NEXT:    movq %rbp, %r11
 ; AVX512F-NEXT:    sarq $63, %r11
@@ -3915,8 +3915,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    movq %r11, %rax
 ; AVX512F-NEXT:    mulq %rdi
 ; AVX512F-NEXT:    movq %rax, %rbx
-; AVX512F-NEXT:    addq %r14, %rdx
 ; AVX512F-NEXT:    imulq %rdi, %r11
+; AVX512F-NEXT:    addq %r14, %r11
 ; AVX512F-NEXT:    addq %rdx, %r11
 ; AVX512F-NEXT:    addq %r10, %rbx
 ; AVX512F-NEXT:    adcq %rsi, %r11
@@ -3987,8 +3987,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    movq %r15, %rax
 ; AVX512BW-NEXT:    mulq %r12
 ; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    addq %rbx, %rdx
 ; AVX512BW-NEXT:    imulq %rsi, %r12
+; AVX512BW-NEXT:    addq %rbx, %r12
 ; AVX512BW-NEXT:    addq %rdx, %r12
 ; AVX512BW-NEXT:    movq %rsi, %rbx
 ; AVX512BW-NEXT:    sarq $63, %rbx
@@ -3997,8 +3997,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    movq %rbx, %rax
 ; AVX512BW-NEXT:    mulq %r10
 ; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    addq %r13, %rdx
 ; AVX512BW-NEXT:    imulq %r10, %rbx
+; AVX512BW-NEXT:    addq %r13, %rbx
 ; AVX512BW-NEXT:    addq %rdx, %rbx
 ; AVX512BW-NEXT:    addq %rcx, %r14
 ; AVX512BW-NEXT:    adcq %r12, %rbx
@@ -4041,8 +4041,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    movq %r8, %rax
 ; AVX512BW-NEXT:    mulq %rsi
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %r11, %rdx
 ; AVX512BW-NEXT:    imulq %rbp, %rsi
+; AVX512BW-NEXT:    addq %r11, %rsi
 ; AVX512BW-NEXT:    addq %rdx, %rsi
 ; AVX512BW-NEXT:    movq %rbp, %r11
 ; AVX512BW-NEXT:    sarq $63, %r11
@@ -4051,8 +4051,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    movq %r11, %rax
 ; AVX512BW-NEXT:    mulq %rdi
 ; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    addq %r14, %rdx
 ; AVX512BW-NEXT:    imulq %rdi, %r11
+; AVX512BW-NEXT:    addq %r14, %r11
 ; AVX512BW-NEXT:    addq %rdx, %r11
 ; AVX512BW-NEXT:    addq %r10, %rbx
 ; AVX512BW-NEXT:    adcq %rsi, %r11

diff  --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
index 7ddff59c26b0e..943424171d662 100644
--- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
+++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll
@@ -46,16 +46,16 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1
 ; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
 ; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
 ; X86-NEXT:    kmovw %k0, %edi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload
-; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload
-; X86-NEXT:    kmovw %k2, %edi
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    kmovw %k1, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    kmovw %k1, %eax
 ; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movw %ax, (%esi)
+; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload
+; X86-NEXT:    kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload
+; X86-NEXT:    kmovw %k0, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movw %dx, (%esi)
 ; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -104,11 +104,11 @@ define void @test(<16 x i32> %a0, <16 x i32> %b0, <16 x i32> %a1, <16 x i32> %b1
 ; X64-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
 ; X64-NEXT:    kmovw %k0, %edi
 ; X64-NEXT:    kmovw %k1, %r8d
-; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    addl %ecx, %edx
 ; X64-NEXT:    addl %r8d, %eax
 ; X64-NEXT:    addl %esi, %eax
 ; X64-NEXT:    addl %edx, %eax
+; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    movw %ax, (%rbx)
 ; X64-NEXT:    leaq -8(%rbp), %rsp
 ; X64-NEXT:    popq %rbx

diff  --git a/llvm/test/CodeGen/X86/win-smallparams.ll b/llvm/test/CodeGen/X86/win-smallparams.ll
index 191f1642b10dc..63ac6e0582add 100644
--- a/llvm/test/CodeGen/X86/win-smallparams.ll
+++ b/llvm/test/CodeGen/X86/win-smallparams.ll
@@ -57,14 +57,13 @@ entry:
 ; WIN32: calll _manyargs
 
 ; WIN32-LABEL: _manyargs:
-; WIN32: pushl %ebx
 ; WIN32: pushl %edi
 ; WIN32: pushl %esi
-; WIN32-DAG: movsbl 16(%esp),
-; WIN32-DAG: movswl 20(%esp),
-; WIN32-DAG: movzbl 24(%esp),
-; WIN32-DAG: movzwl 28(%esp),
-; WIN32-DAG: movzbl 32(%esp),
-; WIN32-DAG: movzwl 36(%esp),
+; WIN32-DAG: movzwl 32(%esp),
+; WIN32-DAG: movzbl 28(%esp),
+; WIN32-DAG: movzwl 24(%esp),
+; WIN32-DAG: movzbl 20(%esp),
+; WIN32-DAG: movswl 16(%esp),
+; WIN32-DAG: movsbl 12(%esp),
 ; WIN32: retl
 

diff  --git a/llvm/test/CodeGen/X86/x86-cmov-converter.ll b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
index 9d55df0602f99..bfb402c7cf0c0 100644
--- a/llvm/test/CodeGen/X86/x86-cmov-converter.ll
+++ b/llvm/test/CodeGen/X86/x86-cmov-converter.ll
@@ -766,8 +766,8 @@ define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #0
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl %esi, %r8d
 ; CHECK-NEXT:  .LBB9_2: # %entry
-; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %r8d, %eax
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    retq
 ;
 ; CHECK-FORCEALL-LABEL: test_cmov_memoperand_in_group:
@@ -781,8 +781,8 @@ define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #0
 ; CHECK-FORCEALL-NEXT:    movl %edi, %eax
 ; CHECK-FORCEALL-NEXT:    movl %esi, %r8d
 ; CHECK-FORCEALL-NEXT:  .LBB9_2: # %entry
-; CHECK-FORCEALL-NEXT:    addl %edx, %eax
 ; CHECK-FORCEALL-NEXT:    addl %r8d, %eax
+; CHECK-FORCEALL-NEXT:    addl %edx, %eax
 ; CHECK-FORCEALL-NEXT:    retq
 entry:
   %cond = icmp ugt i32 %a, %b
@@ -808,8 +808,8 @@ define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    movl %esi, %r8d
 ; CHECK-NEXT:  .LBB10_2: # %entry
-; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %r8d, %eax
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    retq
 ;
 ; CHECK-FORCEALL-LABEL: test_cmov_memoperand_in_group2:
@@ -823,8 +823,8 @@ define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, ptr %y.ptr) #
 ; CHECK-FORCEALL-NEXT:    movl %edi, %eax
 ; CHECK-FORCEALL-NEXT:    movl %esi, %r8d
 ; CHECK-FORCEALL-NEXT:  .LBB10_2: # %entry
-; CHECK-FORCEALL-NEXT:    addl %edx, %eax
 ; CHECK-FORCEALL-NEXT:    addl %r8d, %eax
+; CHECK-FORCEALL-NEXT:    addl %edx, %eax
 ; CHECK-FORCEALL-NEXT:    retq
 entry:
   %cond = icmp ugt i32 %a, %b

diff  --git a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 455f0c706d70e..263ff329163c5 100644
--- a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -38,8 +38,8 @@ define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
 ; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    movl %edi, %edx
 ; CHECK-NEXT:    callq bar at PLT
-; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
+; CHECK-NEXT:    addl %edx, %eax
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    cvtsi2ss %eax, %xmm0
 ; CHECK-NEXT:    addss %xmm1, %xmm0

diff  --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 4adc80b3b8bd6..8d68303300ec6 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -213,53 +213,52 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $8, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    movl %ecx, %ebx
+; WIN32-NEXT:    sarl $31, %ebx
 ; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    imull %ebp, %edi
-; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %eax, %ebx
+; WIN32-NEXT:    addl %esi, %ebx
+; WIN32-NEXT:    addl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %ebp
-; WIN32-NEXT:    imull %ecx, %ebp
+; WIN32-NEXT:    movl %esi, %edx
+; WIN32-NEXT:    imull %ecx, %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %ebp, %edx
 ; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %esi
+; WIN32-NEXT:    adcl %ebx, %esi
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %ebp, %ecx
-; WIN32-NEXT:    adcl $0, %ebx
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    adcl $0, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    adcl %ebx, %edi
+; WIN32-NEXT:    adcl %edi, %ebx
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
@@ -575,29 +574,26 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    movl %ebp, %edi
 ; WIN32-NEXT:    imull %ecx, %edi
 ; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    imull %ebx, %ecx
+; WIN32-NEXT:    addl %edi, %ecx
 ; WIN32-NEXT:    addl %edx, %ecx
 ; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    imull %esi, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ebx, %edx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    movl %esi, %edi
-; WIN32-NEXT:    imull %esi, %ebx
+; WIN32-NEXT:    imull %edi, %ebx
 ; WIN32-NEXT:    addl %edx, %ebx
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    addl %edx, %ebx
+; WIN32-NEXT:    addl %esi, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    adcl %ecx, %ebx
 ; WIN32-NEXT:    movl %edi, %eax
@@ -606,20 +602,20 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %esi, %ebp
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    addl %esi, %edi
 ; WIN32-NEXT:    adcl $0, %ecx
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %ebp, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    adcl %ecx, %edi
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    adcl %ecx, %ebp
 ; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    addl %ebp, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
@@ -632,9 +628,9 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    jne LBB12_2
 ; WIN32-NEXT:  # %bb.1:
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:  LBB12_2:
-; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    movl %edi, %edx
 ; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
@@ -996,60 +992,60 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    movl %ebp, %esi
 ; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %ebx, %esi
-; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %eax, %edi
+; WIN32-NEXT:    addl %esi, %edi
 ; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %ebx
-; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    movl %esi, %edx
+; WIN32-NEXT:    imull %ecx, %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %ebx, %edx
 ; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    adcl %edi, %esi
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %ecx, %ebx
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    addl %edi, %ecx
 ; WIN32-NEXT:    adcl $0, %ebp
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl %ecx, %ebx
-; WIN32-NEXT:    adcl %ebp, %edi
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    addl %ecx, %edi
+; WIN32-NEXT:    adcl %ebp, %ebx
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    xorl %ebx, %edx
-; WIN32-NEXT:    xorl %eax, %ebx
-; WIN32-NEXT:    orl %edx, %ebx
+; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    xorl %edi, %edx
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    orl %edx, %edi
 ; WIN32-NEXT:    jne LBB18_1
 ; WIN32-NEXT:  # %bb.3: # %continue
 ; WIN32-NEXT:    movb $1, %al
@@ -1712,20 +1708,20 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    movl %ebx, %ecx
 ; WIN32-NEXT:    imull %ebp, %ecx
 ; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    imull %esi, %ebx
+; WIN32-NEXT:    addl %ecx, %ebx
 ; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    addl %ecx, %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %esi, %ebx
 ; WIN32-NEXT:    addl %edx, %ebx
 ; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %eax, %edi
 ; WIN32-NEXT:    imull %ecx, %edi
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %edi, %edx
 ; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    addl %edi, %ecx
 ; WIN32-NEXT:    addl %edx, %ecx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1809,55 +1805,55 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $12, %esp
+; WIN32-NEXT:    subl $16, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ebp
-; WIN32-NEXT:    movl 4(%eax), %ebx
+; WIN32-NEXT:    movl (%eax), %ebx
+; WIN32-NEXT:    movl 4(%eax), %ebp
 ; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    movl %ebx, %esi
 ; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %ebx, %esi
-; WIN32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %esi, %edi
 ; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %ebx
-; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    movl %esi, %edx
+; WIN32-NEXT:    imull %ecx, %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %ebx, %edx
 ; WIN32-NEXT:    imull %ecx, %esi
 ; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %edx, %esi
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    adcl %edi, %esi
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %edi, %ecx
-; WIN32-NEXT:    adcl $0, %ebx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; WIN32-NEXT:    adcl $0, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    adcl %ebx, %edi
+; WIN32-NEXT:    adcl %edi, %ebx
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -1872,7 +1868,7 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $12, %esp
+; WIN32-NEXT:    addl $16, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx

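The hunks above all fall out of the same rewrite: the machine combiner may now rebalance a left-leaning integer add chain ((a + b) + c) + d into the shallower (a + b) + (c + d), so the two inner sums can execute in parallel. A hedged C sketch of the simplest shape that benefits; the function name is illustrative:

/* Hypothetical sketch: parsed as ((a + b) + c) + d, i.e. three
 * serially dependent adds before reassociation. */
int sum4(int a, int b, int c, int d) {
    /* After reassociation the backend may compute a + b and c + d
       independently and combine them with one final add. */
    return a + b + c + d;
}

The visible cost is register pressure rather than instruction count: keeping two partial sums live can require an extra register, which is why t64_3_2 in urem-seteq-nonzero.ll now pushes and pops %esi, and why many of the hunks above are pure register or spill-slot renames with no change in the operations performed.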