[llvm] 30eff7f - [DAG] Attempt to replace a mul node with an existing umul_lohi/smul_lohi node (PR59217)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 29 04:51:41 PST 2022


Author: Simon Pilgrim
Date: 2022-11-29T12:51:30Z
New Revision: 30eff7f29f97599a94a40907f5b77244af0eaee1

URL: https://github.com/llvm/llvm-project/commit/30eff7f29f97599a94a40907f5b77244af0eaee1
DIFF: https://github.com/llvm/llvm-project/commit/30eff7f29f97599a94a40907f5b77244af0eaee1.diff

LOG: [DAG] Attempt to replace a mul node with an existing umul_lohi/smul_lohi node (PR59217)

As discussed on Issue #59217, under certain circumstances the DAG can generate duplicate MUL and MUL_LOHI nodes, often during MULO legalization.

This patch attempts to replace such MUL nodes with the LO result of an existing UMUL_LOHI/SMUL_LOHI node, adding extra uses of that result instead of keeping a duplicate multiply.
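
For illustration, a minimal IR sketch in the spirit of the updated combine-mul.ll tests (the function name and exact body below are hypothetical, not copied from this diff): the i128 widening multiply lowers to a UMUL_LOHI node whose HI result feeds the high half, while the separate i64 mul becomes a duplicate MUL node that this combine can now fold into additional uses of the LO result. The signed case is analogous with sext and SMUL_LOHI.

    define i64 @mul_and_mulhi_example(i64 %a, i64 %b) {
      %a128 = zext i64 %a to i128
      %b128 = zext i64 %b to i128
      %m128 = mul i128 %a128, %b128   ; lowers to UMUL_LOHI; HI result used below
      %hi128 = lshr i128 %m128, 64
      %hi = trunc i128 %hi128 to i64
      %lo = mul i64 %a, %b            ; duplicate MUL that can now reuse the LO result
      %r = xor i64 %lo, %hi
      ret i64 %r
    }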

Differential Revision: https://reviews.llvm.org/D138790

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
    llvm/test/CodeGen/X86/combine-mul.ll
    llvm/test/CodeGen/X86/muloti.ll
    llvm/test/CodeGen/X86/smul-with-overflow.ll
    llvm/test/CodeGen/X86/smul_fix_sat.ll
    llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fd1259b693bc0..9970ba20610ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4034,6 +4034,21 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
                                       getShiftAmountTy(N0.getValueType()))));
   }
 
+  // Attempt to reuse an existing umul_lohi/smul_lohi node, but only if the
+  // hi result is in use in case we hit this mid-legalization.
+  for (unsigned LoHiOpc : {ISD::UMUL_LOHI, ISD::SMUL_LOHI}) {
+    if (!LegalOperations || TLI.isOperationLegalOrCustom(LoHiOpc, VT)) {
+      SDVTList LoHiVT = DAG.getVTList(VT, VT);
+      // TODO: Can we match commutable operands with getNodeIfExists?
+      if (SDNode *LoHi = DAG.getNodeIfExists(LoHiOpc, LoHiVT, {N0, N1}))
+        if (LoHi->hasAnyUseOfValue(1))
+          return SDValue(LoHi, 0);
+      if (SDNode *LoHi = DAG.getNodeIfExists(LoHiOpc, LoHiVT, {N1, N0}))
+        if (LoHi->hasAnyUseOfValue(1))
+          return SDValue(LoHi, 0);
+    }
+  }
+
   // Try to transform:
   // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
   // mul x, (2^N + 1) --> add (shl x, N), x

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 0a62c42969bb9..fdcceea353bcb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -34,21 +34,19 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
-; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v5, v3, 0
+; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v4, v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v2, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v9, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
+; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -60,19 +58,17 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
 ; GFX10-NEXT:    v_mad_u64_u32 v[6:7], s4, v4, v3, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
-; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v5, v3, 0
-; GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v3
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v5
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s4, v5, v2, 0
+; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s4, v5, v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_add3_u32 v1, v1, v6, v8
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -85,23 +81,21 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
-; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v5, v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v8, v1
-; GFX11-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GFX11-NEXT:    v_mul_lo_u32 v4, v4, v3
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v4, v5
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v6, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v11
+; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v5, v2, 0
+; GFX11-NEXT:    v_mad_u64_u32 v[10:11], null, v5, v3, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_add3_u32 v1, v1, v6, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v4, v6
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -157,28 +151,26 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v10, v6
+; GFX9-NEXT:    v_mad_i64_i32 v[10:11], s[4:5], v4, v3, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v12, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v11, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v7, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v6, v5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, v2, v5
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v7, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v4
+; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -195,21 +187,19 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s4, v5, v2, 0
 ; GFX10-NEXT:    v_mad_i64_i32 v[11:12], s4, v5, v3, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v1
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
+; GFX10-NEXT:    v_add3_u32 v1, v1, v6, v9
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v2
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v3
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v10, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v11
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v7, v2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
@@ -231,36 +221,34 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v4, v3, 0
 ; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v5, v2, 0
 ; GFX11-NEXT:    v_mad_i64_i32 v[11:12], null, v5, v3, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mov_b32_e32 v8, v1
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v8, v6
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX11-NEXT:    v_mul_lo_u32 v8, v5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v9
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
-; GFX11-NEXT:    v_mul_lo_u32 v9, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, v11
+; GFX11-NEXT:    v_add3_u32 v1, v1, v6, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v10, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v11
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v7, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX11-NEXT:    v_add3_u32 v1, v1, v9, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v10, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_cndmask_b32 v4, v6, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index b7b90b72d19ae..9bac05eccabd5 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -437,18 +437,14 @@ define i64 @combine_mul_umul_lohi_i64(i64 %a, i64 %b) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    mulq %rsi
-; SSE-NEXT:    imulq %rsi, %rdi
-; SSE-NEXT:    xorq %rdx, %rdi
-; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    xorq %rdx, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_mul_umul_lohi_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movq %rdi, %rax
 ; AVX-NEXT:    mulq %rsi
-; AVX-NEXT:    imulq %rsi, %rdi
-; AVX-NEXT:    xorq %rdx, %rdi
-; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    xorq %rdx, %rax
 ; AVX-NEXT:    retq
   %a128 = zext i64 %a to i128
   %b128 = zext i64 %b to i128
@@ -465,18 +461,14 @@ define i64 @combine_mul_smul_lohi_commute_i64(i64 %a, i64 %b) {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movq %rdi, %rax
 ; SSE-NEXT:    imulq %rsi
-; SSE-NEXT:    imulq %rdi, %rsi
-; SSE-NEXT:    xorq %rdx, %rsi
-; SSE-NEXT:    movq %rsi, %rax
+; SSE-NEXT:    xorq %rdx, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_mul_smul_lohi_commute_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movq %rdi, %rax
 ; AVX-NEXT:    imulq %rsi
-; AVX-NEXT:    imulq %rdi, %rsi
-; AVX-NEXT:    xorq %rdx, %rsi
-; AVX-NEXT:    movq %rsi, %rax
+; AVX-NEXT:    xorq %rdx, %rax
 ; AVX-NEXT:    retq
   %a128 = sext i64 %a to i128
   %b128 = sext i64 %b to i128
@@ -491,22 +483,18 @@ define i64 @combine_mul_smul_lohi_commute_i64(i64 %a, i64 %b) {
 define i64 @combine_mul_umul_lohi_const_i64(i64 %h) {
 ; SSE-LABEL: combine_mul_umul_lohi_const_i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 ; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 ; SSE-NEXT:    mulq %rcx
-; SSE-NEXT:    imulq %rdi, %rcx
-; SSE-NEXT:    xorq %rdx, %rcx
-; SSE-NEXT:    movq %rcx, %rax
+; SSE-NEXT:    xorq %rdx, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_mul_umul_lohi_const_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 ; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 ; AVX-NEXT:    mulq %rcx
-; AVX-NEXT:    imulq %rdi, %rcx
-; AVX-NEXT:    xorq %rdx, %rcx
-; AVX-NEXT:    movq %rcx, %rax
+; AVX-NEXT:    xorq %rdx, %rax
 ; AVX-NEXT:    retq
   %h128 = zext i64 %h to i128
   %m128 = mul nuw i128 %h128, 14181476777654086739
@@ -520,30 +508,26 @@ define i64 @combine_mul_umul_lohi_const_i64(i64 %h) {
 define i64 @combine_mul_smul_lohi_const_i64(i64 %h) {
 ; SSE-LABEL: combine_mul_smul_lohi_const_i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movq %rdi, %rsi
-; SSE-NEXT:    sarq $63, %rsi
-; SSE-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 ; SSE-NEXT:    movq %rdi, %rax
-; SSE-NEXT:    mulq %rcx
-; SSE-NEXT:    imulq %rcx, %rsi
-; SSE-NEXT:    addq %rdx, %rsi
-; SSE-NEXT:    imulq %rdi, %rcx
-; SSE-NEXT:    xorq %rsi, %rcx
-; SSE-NEXT:    movq %rcx, %rax
+; SSE-NEXT:    movq %rdi, %rcx
+; SSE-NEXT:    sarq $63, %rcx
+; SSE-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
+; SSE-NEXT:    mulq %rsi
+; SSE-NEXT:    imulq %rsi, %rcx
+; SSE-NEXT:    addq %rdx, %rcx
+; SSE-NEXT:    xorq %rcx, %rax
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_mul_smul_lohi_const_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    movq %rdi, %rsi
-; AVX-NEXT:    sarq $63, %rsi
-; AVX-NEXT:    movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
 ; AVX-NEXT:    movq %rdi, %rax
-; AVX-NEXT:    mulq %rcx
-; AVX-NEXT:    imulq %rcx, %rsi
-; AVX-NEXT:    addq %rdx, %rsi
-; AVX-NEXT:    imulq %rdi, %rcx
-; AVX-NEXT:    xorq %rsi, %rcx
-; AVX-NEXT:    movq %rcx, %rax
+; AVX-NEXT:    movq %rdi, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
+; AVX-NEXT:    mulq %rsi
+; AVX-NEXT:    imulq %rsi, %rcx
+; AVX-NEXT:    addq %rdx, %rcx
+; AVX-NEXT:    xorq %rcx, %rax
 ; AVX-NEXT:    retq
   %h128 = sext i64 %h to i128
   %m128 = mul nsw i128 %h128, 14181476777654086739

diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 60a2e21dcd03d..a184d49ce75a7 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -13,54 +13,51 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    .cfi_offset %rbx, -24
 ; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movq %rdx, %r11
+; CHECK-NEXT:    movq %rdx, %r10
 ; CHECK-NEXT:    movq %rdi, %r9
-; CHECK-NEXT:    movq %rsi, %rbx
-; CHECK-NEXT:    sarq $63, %rbx
-; CHECK-NEXT:    movq %rdx, %rdi
-; CHECK-NEXT:    imulq %rbx, %rdi
-; CHECK-NEXT:    movq %rdx, %rax
-; CHECK-NEXT:    mulq %rbx
-; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    imulq %rcx, %rbx
-; CHECK-NEXT:    addq %rdi, %rbx
-; CHECK-NEXT:    addq %rdx, %rbx
-; CHECK-NEXT:    movq %rcx, %rdi
+; CHECK-NEXT:    movq %rsi, %rdi
 ; CHECK-NEXT:    sarq $63, %rdi
-; CHECK-NEXT:    movq %rdi, %r14
+; CHECK-NEXT:    movq %rcx, %r11
+; CHECK-NEXT:    imulq %rdi, %r11
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    mulq %rdi
+; CHECK-NEXT:    movq %rax, %rdi
+; CHECK-NEXT:    addq %rax, %r11
+; CHECK-NEXT:    addq %rdx, %r11
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    movq %rax, %r14
 ; CHECK-NEXT:    imulq %rsi, %r14
-; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    mulq %r9
-; CHECK-NEXT:    movq %rax, %r10
-; CHECK-NEXT:    imulq %r9, %rdi
-; CHECK-NEXT:    addq %r14, %rdi
-; CHECK-NEXT:    addq %rdx, %rdi
-; CHECK-NEXT:    addq %r8, %r10
-; CHECK-NEXT:    adcq %rbx, %rdi
-; CHECK-NEXT:    movq %r9, %rax
-; CHECK-NEXT:    mulq %r11
-; CHECK-NEXT:    movq %rdx, %rbx
 ; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    mulq %r11
+; CHECK-NEXT:    addq %rax, %r14
+; CHECK-NEXT:    addq %rdx, %r14
+; CHECK-NEXT:    addq %rdi, %r8
+; CHECK-NEXT:    adcq %r11, %r14
+; CHECK-NEXT:    movq %r9, %rax
+; CHECK-NEXT:    mulq %r10
 ; CHECK-NEXT:    movq %rdx, %r11
-; CHECK-NEXT:    movq %rax, %r14
-; CHECK-NEXT:    addq %rbx, %r14
-; CHECK-NEXT:    adcq $0, %r11
+; CHECK-NEXT:    movq %rax, %rdi
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    mulq %r10
+; CHECK-NEXT:    movq %rdx, %r10
+; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    addq %r11, %rbx
+; CHECK-NEXT:    adcq $0, %r10
 ; CHECK-NEXT:    movq %r9, %rax
 ; CHECK-NEXT:    mulq %rcx
-; CHECK-NEXT:    movq %rdx, %rbx
+; CHECK-NEXT:    movq %rdx, %r11
 ; CHECK-NEXT:    movq %rax, %r9
-; CHECK-NEXT:    addq %r14, %r9
-; CHECK-NEXT:    adcq %r11, %rbx
+; CHECK-NEXT:    addq %rbx, %r9
+; CHECK-NEXT:    adcq %r10, %r11
 ; CHECK-NEXT:    setb %al
-; CHECK-NEXT:    movzbl %al, %r11d
+; CHECK-NEXT:    movzbl %al, %r10d
 ; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    mulq %rcx
-; CHECK-NEXT:    addq %rbx, %rax
-; CHECK-NEXT:    adcq %r11, %rdx
-; CHECK-NEXT:    addq %r10, %rax
-; CHECK-NEXT:    adcq %rdi, %rdx
+; CHECK-NEXT:    addq %r11, %rax
+; CHECK-NEXT:    adcq %r10, %rdx
+; CHECK-NEXT:    addq %r8, %rax
+; CHECK-NEXT:    adcq %r14, %rdx
 ; CHECK-NEXT:    movq %r9, %rcx
 ; CHECK-NEXT:    sarq $63, %rcx
 ; CHECK-NEXT:    xorq %rcx, %rdx
@@ -68,7 +65,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    orq %rdx, %rcx
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:  ## %bb.2: ## %nooverflow
-; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movq %r9, %rdx
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14

diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index 83b9460c7dae3..6d8b83824a6d5 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,7 +191,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $184, %esp
+; X86-NEXT:    subl $192, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
@@ -199,172 +199,139 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ebp, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
@@ -372,40 +339,71 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -413,411 +411,401 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %cl, %esi
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %al, %ebp
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movzbl %cl, %ebx
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    setb %dl
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movzbl %dl, %esi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movzbl %bl, %edi
+; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    setb %al
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movzbl %al, %ebp
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movzbl %bl, %edi
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movzbl %cl, %edx
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movzbl %al, %edi
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %al, %ebp
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edx, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %edx, %ebp
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    xorl %edx, %ebp
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    andl $1, %edx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
-; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    xorl %edi, %eax
 ; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movb %dl, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movb %cl, 16(%eax)
 ; X86-NEXT:    setne 20(%eax)
-; X86-NEXT:    addl $184, %esp
+; X86-NEXT:    addl $192, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -832,189 +820,170 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r8, %r12
-; X64-NEXT:    movq %rcx, %r15
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rsi, %r8
+; X64-NEXT:    movq %r8, %r13
+; X64-NEXT:    movq %rcx, %r10
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rsi, %rbx
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT:    andl $1, %esi
-; X64-NEXT:    negq %rsi
-; X64-NEXT:    andl $1, %r15d
-; X64-NEXT:    negq %r15
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    andl $1, %r11d
+; X64-NEXT:    negq %r11
+; X64-NEXT:    andl $1, %r10d
+; X64-NEXT:    negq %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %rax, %rbx
+; X64-NEXT:    addq %rax, %rdi
+; X64-NEXT:    adcq %rdx, %r8
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    addq %rax, %r8
 ; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movzbl %dil, %r13d
-; X64-NEXT:    addq %rax, %rcx
-; X64-NEXT:    adcq %rdx, %r13
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rdi, %r11
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %r14, %rbp
+; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    addq %rbp, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r10, %rdi
-; X64-NEXT:    setb %r11b
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    adcq %r13, %rsi
+; X64-NEXT:    setb %bpl
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movzbl %r11b, %edx
-; X64-NEXT:    adcq %rdx, %r10
-; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    adcq %rbx, %r10
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    movzbl %bpl, %edx
+; X64-NEXT:    adcq %rdx, %r14
+; X64-NEXT:    addq %r12, %rax
+; X64-NEXT:    movq %r12, %r9
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    adcq %rdi, %r14
+; X64-NEXT:    adcq $0, %r8
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %r11, %rbp
-; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r13, %r15
+; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    addq %r8, %rbp
-; X64-NEXT:    adcq %rax, %r11
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    addq %rbx, %r15
+; X64-NEXT:    adcq %r13, %rbp
 ; X64-NEXT:    setb %al
-; X64-NEXT:    addq %r9, %r11
-; X64-NEXT:    movzbl %al, %r14d
-; X64-NEXT:    adcq %rdx, %r14
-; X64-NEXT:    addq %r8, %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r10, %rbp
-; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    addq %rdi, %rbp
+; X64-NEXT:    movzbl %al, %r12d
+; X64-NEXT:    adcq %rdx, %r12
+; X64-NEXT:    addq %rbx, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %r14, %r15
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    addq %r8, %rbp
+; X64-NEXT:    adcq %rcx, %r12
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    addq %rcx, %r11
-; X64-NEXT:    adcq %r13, %r14
+; X64-NEXT:    addq %rax, %r8
+; X64-NEXT:    adcq %rdx, %r14
 ; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    addq %rax, %rbx
-; X64-NEXT:    adcq %rdx, %r10
-; X64-NEXT:    setb %cl
-; X64-NEXT:    addq %rax, %r10
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    adcq %rdx, %rcx
-; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    adcq %r14, %rbx
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %r10
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    addq %rax, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    movzbl %dil, %esi
+; X64-NEXT:    adcq %rdx, %rsi
+; X64-NEXT:    addq %rax, %rbp
+; X64-NEXT:    adcq %r12, %r8
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    adcq %rax, %r14
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    addq %rax, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    addq %r9, %r14
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r9, %rdi
 ; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq %rax, %rdx
 ; X64-NEXT:    setb %al
-; X64-NEXT:    addq %rdi, %rcx
-; X64-NEXT:    movzbl %al, %edi
-; X64-NEXT:    adcq %r8, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    imulq %r15, %rax
-; X64-NEXT:    imulq %r15, %r12
-; X64-NEXT:    addq %rax, %r12
-; X64-NEXT:    addq %rdx, %r12
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    imulq %rsi
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r9, %r8
-; X64-NEXT:    addq %rax, %r8
+; X64-NEXT:    addq %rsi, %rcx
+; X64-NEXT:    movzbl %al, %esi
+; X64-NEXT:    adcq %rdx, %rsi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    imulq %r11
+; X64-NEXT:    movq %r9, %r11
+; X64-NEXT:    addq %rax, %r11
+; X64-NEXT:    movq %rdi, %r12
 ; X64-NEXT:    adcq %rdx, %r12
-; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    adcq %rdi, %r12
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    movq %r13, %r15
+; X64-NEXT:    addq %rcx, %r11
+; X64-NEXT:    adcq %rsi, %r12
+; X64-NEXT:    movq %rbx, %r10
+; X64-NEXT:    addq %r13, %r10
+; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    addq %rcx, %r15
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    adcq %rdx, %rcx
+; X64-NEXT:    addq %rcx, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    adcq %rsi, %r13
 ; X64-NEXT:    setb %r9b
-; X64-NEXT:    addq %rdi, %rcx
-; X64-NEXT:    movzbl %r9b, %edi
-; X64-NEXT:    adcq %rdx, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    imulq %rsi, %rdx
-; X64-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    addq %rdx, %rsi
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT:    addq %rcx, %r13
+; X64-NEXT:    movzbl %r9b, %ecx
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    adcq %r10, %rdx
 ; X64-NEXT:    addq %r13, %rax
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    adcq %rdi, %rsi
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %r15
-; X64-NEXT:    adcq %r8, %rax
-; X64-NEXT:    adcq %r12, %rsi
-; X64-NEXT:    addq %r11, %r13
-; X64-NEXT:    adcq %rbx, %r15
-; X64-NEXT:    adcq %r10, %rax
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT:    movq %rbp, %rcx
+; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT:    adcq %rdi, %r10
+; X64-NEXT:    adcq %r11, %rax
+; X64-NEXT:    adcq %r12, %rdx
+; X64-NEXT:    addq %rbp, %rbx
+; X64-NEXT:    adcq %r8, %r10
+; X64-NEXT:    adcq %r14, %rax
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    movq %r15, %rcx
 ; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    xorq %rcx, %rsi
-; X64-NEXT:    xorq %rcx, %r15
-; X64-NEXT:    orq %rsi, %r15
+; X64-NEXT:    xorq %rcx, %rdx
+; X64-NEXT:    xorq %rcx, %r10
+; X64-NEXT:    orq %rdx, %r10
 ; X64-NEXT:    xorq %rcx, %rax
-; X64-NEXT:    orq %r15, %rax
-; X64-NEXT:    xorq %r13, %rcx
+; X64-NEXT:    orq %r10, %rax
+; X64-NEXT:    xorq %rbx, %rcx
 ; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movl %eax, %esi
 ; X64-NEXT:    andl $1, %esi
 ; X64-NEXT:    movq %rsi, %rdx
 ; X64-NEXT:    negq %rdx
-; X64-NEXT:    xorq %rdx, %rbp
+; X64-NEXT:    xorq %rdx, %r15
 ; X64-NEXT:    xorq %rax, %rdx
-; X64-NEXT:    orq %rbp, %rdx
+; X64-NEXT:    orq %r15, %rdx
 ; X64-NEXT:    orq %rcx, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload

diff  --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 0532916b1e4ca..fc5a54b3cf4ce 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -376,68 +376,65 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    imull %ebx, %edi
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull %ebp, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    imull %ecx, %edx
+; X86-NEXT:    imull %ebx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    imull %ebp, %edi
-; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    xorl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    notl %ecx
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    cmovel %ebx, %esi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %ebp, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    xorl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    notl %ebx
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    cmovel %ecx, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi

diff  --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index fc40c539f37c7..7113aaf7e83ed 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -14,61 +14,58 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %rbx, -32
 ; X64-NEXT:    .cfi_offset %r14, -24
 ; X64-NEXT:    .cfi_offset %r15, -16
-; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rdi, %r10
-; X64-NEXT:    movq %rsi, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    imulq %r14, %rdi
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    imulq %rcx, %r14
-; X64-NEXT:    addq %rdi, %r14
-; X64-NEXT:    addq %rdx, %r14
-; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    movq %rsi, %rdi
 ; X64-NEXT:    sarq $63, %rdi
-; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    imulq %rdi, %rbx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rax, %rbx
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    imulq %rsi, %r15
-; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    imulq %r10, %rdi
-; X64-NEXT:    addq %r15, %rdi
-; X64-NEXT:    addq %rdx, %rdi
-; X64-NEXT:    addq %r9, %r11
-; X64-NEXT:    adcq %r14, %rdi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    addq %rax, %r15
+; X64-NEXT:    addq %rdx, %r15
+; X64-NEXT:    addq %rdi, %r9
+; X64-NEXT:    adcq %rbx, %r15
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r14, %r15
-; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rbx, %r14
+; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r15, %r10
-; X64-NEXT:    adcq %rbx, %r14
+; X64-NEXT:    addq %r14, %r10
+; X64-NEXT:    adcq %r11, %rbx
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %ebx
+; X64-NEXT:    movzbl %al, %r11d
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    adcq %rbx, %rdx
-; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    adcq %rdi, %rdx
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    adcq %r11, %rdx
+; X64-NEXT:    addq %r9, %rax
+; X64-NEXT:    adcq %r15, %rdx
 ; X64-NEXT:    movq %r10, 8(%r8)
 ; X64-NEXT:    sarq $63, %r10
 ; X64-NEXT:    xorq %r10, %rdx
 ; X64-NEXT:    xorq %rax, %r10
 ; X64-NEXT:    orq %rdx, %r10
 ; X64-NEXT:    setne %al
-; X64-NEXT:    movq %r9, (%r8)
+; X64-NEXT:    movq %rdi, (%r8)
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
@@ -84,8 +81,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $52, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 72
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 80
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -102,19 +99,20 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -126,197 +124,181 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %esi, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    imull %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    orl %ebp, %eax
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -324,7 +306,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -358,232 +340,214 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movq %r8, %rbx
-; X64-NEXT:    movq %rcx, %r11
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rsi, %r11
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rsi, %rbx
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %r8, %r10
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rbx, %r14
 ; X64-NEXT:    adcq %rcx, %r12
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %ecx
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r12, %r14
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %r12, %rsi
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r8, %r13
+; X64-NEXT:    addq %rbx, %r13
 ; X64-NEXT:    adcq $0, %r12
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %r9, %rsi
 ; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r12, %r11
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r15, %r9
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r11, %r8
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r10, %rbp
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
-; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    adcq %r12, %rbx
+; X64-NEXT:    setb %r8b
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %rbx, %rbp
+; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    adcq %rax, %r13
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    addq %r10, %rbp
+; X64-NEXT:    adcq %r14, %r13
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, %rsi
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %r11, %rbx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r11, %r9
-; X64-NEXT:    adcq $0, %r13
+; X64-NEXT:    addq %r8, %r9
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    adcq %r13, %r10
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    adcq %r10, %r11
 ; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r10, %r13
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r11, %r8
 ; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r11
-; X64-NEXT:    addq %r8, %rdi
+; X64-NEXT:    adcq %rax, %r10
+; X64-NEXT:    addq %rbp, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r13, %rdi
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %rbp, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    addq %r14, %r13
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Folded Reload
-; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Reload
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rsi, %r9
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r8, %rsi
-; X64-NEXT:    adcq %rdi, %r9
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    addq %r9, %rax
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    adcq %rdi, %r11
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r9, %r14
-; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    addq %r11, %r13
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    addq %r13, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r11, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X64-NEXT:    adcq %rax, %r14
+; X64-NEXT:    addq %r8, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r10, %r9
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    adcq %rax, %r13
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %r10, %r13
-; X64-NEXT:    sarq $63, %r13
-; X64-NEXT:    movq %r13, %rcx
-; X64-NEXT:    imulq %r12, %rcx
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    imulq %r13, %r15
-; X64-NEXT:    addq %rcx, %r15
-; X64-NEXT:    addq %rdx, %r15
-; X64-NEXT:    movq %r13, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; X64-NEXT:    imulq %rsi, %rcx
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    imulq %r13, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %r15, %r8
+; X64-NEXT:    sarq $63, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    addq %rdx, %rbx
-; X64-NEXT:    addq %rax, %r8
-; X64-NEXT:    adcq %r15, %rbx
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r9, %r15
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    addq %r11, %r15
-; X64-NEXT:    adcq %r9, %r13
-; X64-NEXT:    setb %cl
-; X64-NEXT:    addq %rax, %r13
-; X64-NEXT:    movzbl %cl, %r9d
-; X64-NEXT:    adcq %rdx, %r9
-; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    adcq %rbx, %r9
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %r9, %r10
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    addq %rsi, %r10
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r9, %r14
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r8, %r9
+; X64-NEXT:    imulq %r12, %r9
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq {{[0-9]+}}(%rsp)
+; X64-NEXT:    addq %rax, %r9
+; X64-NEXT:    addq %rdx, %r9
+; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    adcq %r10, %r9
+; X64-NEXT:    addq %r11, %r14
+; X64-NEXT:    movzbl %sil, %edi
+; X64-NEXT:    adcq %rcx, %rdi
+; X64-NEXT:    addq %rax, %r14
+; X64-NEXT:    adcq %r9, %rdi
 ; X64-NEXT:    sarq $63, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    imulq %r12, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    imulq %r12, %rbx
-; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
-; X64-NEXT:    movq %r8, %rdx
-; X64-NEXT:    imulq %r12, %rdx
-; X64-NEXT:    imulq %r12, %r10
-; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    movq %r10, %rcx
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    addq %r10, %rbx
-; X64-NEXT:    addq %rsi, %r8
-; X64-NEXT:    adcq %rbx, %rcx
-; X64-NEXT:    movq %rsi, %rbx
-; X64-NEXT:    addq %r10, %rbx
-; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    addq %rax, %rbx
-; X64-NEXT:    adcq %rdx, %r10
-; X64-NEXT:    setb %r12b
-; X64-NEXT:    addq %rax, %r10
-; X64-NEXT:    movzbl %r12b, %eax
-; X64-NEXT:    adcq %rdx, %rax
-; X64-NEXT:    addq %r8, %r10
-; X64-NEXT:    adcq %rcx, %rax
-; X64-NEXT:    addq %r11, %rsi
-; X64-NEXT:    adcq %r15, %rbx
-; X64-NEXT:    adcq %r13, %r10
-; X64-NEXT:    adcq %r9, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    adcq %rdx, %r11
+; X64-NEXT:    setb %bl
+; X64-NEXT:    imulq %r12, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    addq %rax, %r15
+; X64-NEXT:    addq %rdx, %r15
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    adcq %rcx, %r15
+; X64-NEXT:    addq %r9, %r11
+; X64-NEXT:    movzbl %bl, %edx
+; X64-NEXT:    adcq %r8, %rdx
+; X64-NEXT:    addq %rax, %r11
+; X64-NEXT:    adcq %r15, %rdx
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %r10
-; X64-NEXT:    adcq %rbp, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    sarq $63, %rdx
-; X64-NEXT:    xorq %rdx, %rax
-; X64-NEXT:    xorq %rdx, %rbx
-; X64-NEXT:    orq %rax, %rbx
-; X64-NEXT:    xorq %rdx, %r10
-; X64-NEXT:    xorq %rsi, %rdx
-; X64-NEXT:    orq %r10, %rdx
-; X64-NEXT:    orq %rbx, %rdx
+; X64-NEXT:    adcq %r10, %rcx
+; X64-NEXT:    adcq %r14, %r11
+; X64-NEXT:    adcq %rdi, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT:    adcq %r13, %r11
+; X64-NEXT:    adcq %rbp, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    xorq %rax, %rdx
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    xorq %rax, %r11
+; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    orq %r11, %rax
+; X64-NEXT:    orq %rcx, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq %rcx, 24(%rax)
+; X64-NEXT:    movq %rdi, 24(%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; X64-NEXT:    movq %rcx, (%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -609,8 +573,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $152, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 172
+; X86-NEXT:    subl $156, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 176
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -644,66 +608,65 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
@@ -772,43 +735,43 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %esi, %ebx
@@ -816,25 +779,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -857,83 +820,80 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebp, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
@@ -942,67 +902,68 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %bl, %esi
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
@@ -1015,9 +976,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
@@ -1030,131 +991,132 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
@@ -1165,369 +1127,335 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, (%esp) ## 4-byte Spill
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    setb %dl
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %dl, %edx
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl (%esp), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    imull %ecx, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    imull %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edi, %edx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %ebx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %bl, %edi
 ; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    setb %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %esi, %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    setb %dl
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %dl, %ecx
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    xorl %esi, %edi
-; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    xorl %esi, %edi
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    xorl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    orl %edx, %ebp
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    orl %ebp, %eax
 ; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 28(%eax)
+; X86-NEXT:    movl %ebx, 28(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1543,7 +1471,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $152, %esp
+; X86-NEXT:    addl $156, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 7bb7c9b481d39..fef44e062fd3b 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3297,112 +3297,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
 ; SSE2-NEXT:    movq %r8, %r14
+; SSE2-NEXT:    movq %rcx, %r13
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rsi, %r11
 ; SSE2-NEXT:    movq %rdi, %r10
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT:    movq %r11, %r12
-; SSE2-NEXT:    sarq $63, %r12
-; SSE2-NEXT:    movq %r14, %rbx
-; SSE2-NEXT:    imulq %r12, %rbx
+; SSE2-NEXT:    movq %r11, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %r9, %r15
+; SSE2-NEXT:    imulq %rcx, %r15
 ; SSE2-NEXT:    movq %r14, %rax
-; SSE2-NEXT:    mulq %r12
+; SSE2-NEXT:    mulq %rcx
 ; SSE2-NEXT:    movq %rax, %rdi
-; SSE2-NEXT:    imulq %r9, %r12
-; SSE2-NEXT:    addq %rbx, %r12
-; SSE2-NEXT:    addq %rdx, %r12
-; SSE2-NEXT:    movq %r9, %rbx
-; SSE2-NEXT:    sarq $63, %rbx
-; SSE2-NEXT:    movq %rbx, %r13
-; SSE2-NEXT:    imulq %r11, %r13
-; SSE2-NEXT:    movq %rbx, %rax
+; SSE2-NEXT:    addq %rax, %r15
+; SSE2-NEXT:    addq %rdx, %r15
+; SSE2-NEXT:    movq %r9, %rax
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    imulq %r11, %rcx
 ; SSE2-NEXT:    mulq %r10
-; SSE2-NEXT:    movq %rax, %r15
-; SSE2-NEXT:    imulq %r10, %rbx
-; SSE2-NEXT:    addq %r13, %rbx
-; SSE2-NEXT:    addq %rdx, %rbx
-; SSE2-NEXT:    addq %rdi, %r15
-; SSE2-NEXT:    adcq %r12, %rbx
+; SSE2-NEXT:    movq %rax, %rbx
+; SSE2-NEXT:    addq %rax, %rcx
+; SSE2-NEXT:    addq %rdx, %rcx
+; SSE2-NEXT:    addq %rdi, %rbx
+; SSE2-NEXT:    adcq %r15, %rcx
 ; SSE2-NEXT:    movq %r10, %rax
 ; SSE2-NEXT:    mulq %r14
-; SSE2-NEXT:    movq %rdx, %r12
+; SSE2-NEXT:    movq %rdx, %r15
 ; SSE2-NEXT:    movq %rax, %rdi
 ; SSE2-NEXT:    movq %r11, %rax
 ; SSE2-NEXT:    mulq %r14
 ; SSE2-NEXT:    movq %rdx, %r14
-; SSE2-NEXT:    movq %rax, %r13
-; SSE2-NEXT:    addq %r12, %r13
+; SSE2-NEXT:    movq %rax, %r12
+; SSE2-NEXT:    addq %r15, %r12
 ; SSE2-NEXT:    adcq $0, %r14
 ; SSE2-NEXT:    movq %r10, %rax
 ; SSE2-NEXT:    mulq %r9
-; SSE2-NEXT:    movq %rdx, %r12
+; SSE2-NEXT:    movq %rdx, %r15
 ; SSE2-NEXT:    movq %rax, %r10
-; SSE2-NEXT:    addq %r13, %r10
-; SSE2-NEXT:    adcq %r14, %r12
+; SSE2-NEXT:    addq %r12, %r10
+; SSE2-NEXT:    adcq %r14, %r15
 ; SSE2-NEXT:    setb %al
 ; SSE2-NEXT:    movzbl %al, %r14d
 ; SSE2-NEXT:    movq %r11, %rax
 ; SSE2-NEXT:    mulq %r9
-; SSE2-NEXT:    addq %r12, %rax
-; SSE2-NEXT:    adcq %r14, %rdx
 ; SSE2-NEXT:    addq %r15, %rax
-; SSE2-NEXT:    adcq %rbx, %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT:    movq %r10, 8(%r12)
+; SSE2-NEXT:    adcq %r14, %rdx
+; SSE2-NEXT:    addq %rbx, %rax
+; SSE2-NEXT:    adcq %rcx, %rdx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT:    movq %r10, 8(%r15)
 ; SSE2-NEXT:    sarq $63, %r10
 ; SSE2-NEXT:    xorq %r10, %rdx
 ; SSE2-NEXT:    xorq %rax, %r10
-; SSE2-NEXT:    xorl %r15d, %r15d
+; SSE2-NEXT:    xorl %ecx, %ecx
 ; SSE2-NEXT:    orq %rdx, %r10
-; SSE2-NEXT:    setne %r15b
-; SSE2-NEXT:    movq %rcx, %rbx
-; SSE2-NEXT:    sarq $63, %rbx
-; SSE2-NEXT:    movq %rsi, %r10
-; SSE2-NEXT:    imulq %rbx, %r10
+; SSE2-NEXT:    setne %cl
+; SSE2-NEXT:    movq %r13, %r9
+; SSE2-NEXT:    sarq $63, %r9
+; SSE2-NEXT:    movq %rbp, %r11
+; SSE2-NEXT:    imulq %r9, %r11
 ; SSE2-NEXT:    movq %rsi, %rax
-; SSE2-NEXT:    mulq %rbx
+; SSE2-NEXT:    mulq %r9
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    imulq %rbp, %rbx
-; SSE2-NEXT:    addq %r10, %rbx
-; SSE2-NEXT:    addq %rdx, %rbx
-; SSE2-NEXT:    movq %rbp, %r10
-; SSE2-NEXT:    sarq $63, %r10
-; SSE2-NEXT:    movq %r10, %r14
-; SSE2-NEXT:    imulq %rcx, %r14
-; SSE2-NEXT:    movq %r10, %rax
+; SSE2-NEXT:    addq %rax, %r11
+; SSE2-NEXT:    addq %rdx, %r11
+; SSE2-NEXT:    movq %rbp, %rax
+; SSE2-NEXT:    sarq $63, %rax
+; SSE2-NEXT:    movq %rax, %r14
+; SSE2-NEXT:    imulq %r13, %r14
 ; SSE2-NEXT:    mulq %r8
-; SSE2-NEXT:    movq %rax, %r11
-; SSE2-NEXT:    imulq %r8, %r10
-; SSE2-NEXT:    addq %r14, %r10
-; SSE2-NEXT:    addq %rdx, %r10
-; SSE2-NEXT:    addq %r9, %r11
-; SSE2-NEXT:    adcq %rbx, %r10
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    addq %rax, %r14
+; SSE2-NEXT:    addq %rdx, %r14
+; SSE2-NEXT:    addq %r9, %r10
+; SSE2-NEXT:    adcq %r11, %r14
 ; SSE2-NEXT:    movq %r8, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %r9
-; SSE2-NEXT:    movq %rax, %rbx
-; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    movq %rax, %r11
+; SSE2-NEXT:    movq %r13, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %rsi
-; SSE2-NEXT:    movq %rax, %r14
-; SSE2-NEXT:    addq %r9, %r14
+; SSE2-NEXT:    movq %rax, %rbx
+; SSE2-NEXT:    addq %r9, %rbx
 ; SSE2-NEXT:    adcq $0, %rsi
 ; SSE2-NEXT:    movq %r8, %rax
 ; SSE2-NEXT:    mulq %rbp
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    addq %r14, %r9
+; SSE2-NEXT:    addq %rbx, %r9
 ; SSE2-NEXT:    adcq %rsi, %r8
 ; SSE2-NEXT:    setb %al
 ; SSE2-NEXT:    movzbl %al, %esi
-; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    movq %r13, %rax
 ; SSE2-NEXT:    mulq %rbp
 ; SSE2-NEXT:    addq %r8, %rax
 ; SSE2-NEXT:    adcq %rsi, %rdx
-; SSE2-NEXT:    addq %r11, %rax
-; SSE2-NEXT:    adcq %r10, %rdx
-; SSE2-NEXT:    movq %r9, 24(%r12)
+; SSE2-NEXT:    addq %r10, %rax
+; SSE2-NEXT:    adcq %r14, %rdx
+; SSE2-NEXT:    movq %r9, 24(%r15)
 ; SSE2-NEXT:    sarq $63, %r9
 ; SSE2-NEXT:    xorq %r9, %rdx
 ; SSE2-NEXT:    xorq %rax, %r9
@@ -3411,11 +3406,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    negl %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    negl %r15d
-; SSE2-NEXT:    movd %r15d, %xmm0
+; SSE2-NEXT:    negl %ecx
+; SSE2-NEXT:    movd %ecx, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movq %rbx, 16(%r12)
-; SSE2-NEXT:    movq %rdi, (%r12)
+; SSE2-NEXT:    movq %r11, 16(%r15)
+; SSE2-NEXT:    movq %rdi, (%r15)
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
 ; SSE2-NEXT:    popq %r13
@@ -3433,112 +3428,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    pushq %r12
 ; SSSE3-NEXT:    pushq %rbx
 ; SSSE3-NEXT:    movq %r8, %r14
+; SSSE3-NEXT:    movq %rcx, %r13
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rsi, %r11
 ; SSSE3-NEXT:    movq %rdi, %r10
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSSE3-NEXT:    movq %r11, %r12
-; SSSE3-NEXT:    sarq $63, %r12
-; SSSE3-NEXT:    movq %r14, %rbx
-; SSSE3-NEXT:    imulq %r12, %rbx
+; SSSE3-NEXT:    movq %r11, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movq %r9, %r15
+; SSSE3-NEXT:    imulq %rcx, %r15
 ; SSSE3-NEXT:    movq %r14, %rax
-; SSSE3-NEXT:    mulq %r12
+; SSSE3-NEXT:    mulq %rcx
 ; SSSE3-NEXT:    movq %rax, %rdi
-; SSSE3-NEXT:    imulq %r9, %r12
-; SSSE3-NEXT:    addq %rbx, %r12
-; SSSE3-NEXT:    addq %rdx, %r12
-; SSSE3-NEXT:    movq %r9, %rbx
-; SSSE3-NEXT:    sarq $63, %rbx
-; SSSE3-NEXT:    movq %rbx, %r13
-; SSSE3-NEXT:    imulq %r11, %r13
-; SSSE3-NEXT:    movq %rbx, %rax
+; SSSE3-NEXT:    addq %rax, %r15
+; SSSE3-NEXT:    addq %rdx, %r15
+; SSSE3-NEXT:    movq %r9, %rax
+; SSSE3-NEXT:    sarq $63, %rax
+; SSSE3-NEXT:    movq %rax, %rcx
+; SSSE3-NEXT:    imulq %r11, %rcx
 ; SSSE3-NEXT:    mulq %r10
-; SSSE3-NEXT:    movq %rax, %r15
-; SSSE3-NEXT:    imulq %r10, %rbx
-; SSSE3-NEXT:    addq %r13, %rbx
-; SSSE3-NEXT:    addq %rdx, %rbx
-; SSSE3-NEXT:    addq %rdi, %r15
-; SSSE3-NEXT:    adcq %r12, %rbx
+; SSSE3-NEXT:    movq %rax, %rbx
+; SSSE3-NEXT:    addq %rax, %rcx
+; SSSE3-NEXT:    addq %rdx, %rcx
+; SSSE3-NEXT:    addq %rdi, %rbx
+; SSSE3-NEXT:    adcq %r15, %rcx
 ; SSSE3-NEXT:    movq %r10, %rax
 ; SSSE3-NEXT:    mulq %r14
-; SSSE3-NEXT:    movq %rdx, %r12
+; SSSE3-NEXT:    movq %rdx, %r15
 ; SSSE3-NEXT:    movq %rax, %rdi
 ; SSSE3-NEXT:    movq %r11, %rax
 ; SSSE3-NEXT:    mulq %r14
 ; SSSE3-NEXT:    movq %rdx, %r14
-; SSSE3-NEXT:    movq %rax, %r13
-; SSSE3-NEXT:    addq %r12, %r13
+; SSSE3-NEXT:    movq %rax, %r12
+; SSSE3-NEXT:    addq %r15, %r12
 ; SSSE3-NEXT:    adcq $0, %r14
 ; SSSE3-NEXT:    movq %r10, %rax
 ; SSSE3-NEXT:    mulq %r9
-; SSSE3-NEXT:    movq %rdx, %r12
+; SSSE3-NEXT:    movq %rdx, %r15
 ; SSSE3-NEXT:    movq %rax, %r10
-; SSSE3-NEXT:    addq %r13, %r10
-; SSSE3-NEXT:    adcq %r14, %r12
+; SSSE3-NEXT:    addq %r12, %r10
+; SSSE3-NEXT:    adcq %r14, %r15
 ; SSSE3-NEXT:    setb %al
 ; SSSE3-NEXT:    movzbl %al, %r14d
 ; SSSE3-NEXT:    movq %r11, %rax
 ; SSSE3-NEXT:    mulq %r9
-; SSSE3-NEXT:    addq %r12, %rax
-; SSSE3-NEXT:    adcq %r14, %rdx
 ; SSSE3-NEXT:    addq %r15, %rax
-; SSSE3-NEXT:    adcq %rbx, %rdx
-; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSSE3-NEXT:    movq %r10, 8(%r12)
+; SSSE3-NEXT:    adcq %r14, %rdx
+; SSSE3-NEXT:    addq %rbx, %rax
+; SSSE3-NEXT:    adcq %rcx, %rdx
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSSE3-NEXT:    movq %r10, 8(%r15)
 ; SSSE3-NEXT:    sarq $63, %r10
 ; SSSE3-NEXT:    xorq %r10, %rdx
 ; SSSE3-NEXT:    xorq %rax, %r10
-; SSSE3-NEXT:    xorl %r15d, %r15d
+; SSSE3-NEXT:    xorl %ecx, %ecx
 ; SSSE3-NEXT:    orq %rdx, %r10
-; SSSE3-NEXT:    setne %r15b
-; SSSE3-NEXT:    movq %rcx, %rbx
-; SSSE3-NEXT:    sarq $63, %rbx
-; SSSE3-NEXT:    movq %rsi, %r10
-; SSSE3-NEXT:    imulq %rbx, %r10
+; SSSE3-NEXT:    setne %cl
+; SSSE3-NEXT:    movq %r13, %r9
+; SSSE3-NEXT:    sarq $63, %r9
+; SSSE3-NEXT:    movq %rbp, %r11
+; SSSE3-NEXT:    imulq %r9, %r11
 ; SSSE3-NEXT:    movq %rsi, %rax
-; SSSE3-NEXT:    mulq %rbx
+; SSSE3-NEXT:    mulq %r9
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    imulq %rbp, %rbx
-; SSSE3-NEXT:    addq %r10, %rbx
-; SSSE3-NEXT:    addq %rdx, %rbx
-; SSSE3-NEXT:    movq %rbp, %r10
-; SSSE3-NEXT:    sarq $63, %r10
-; SSSE3-NEXT:    movq %r10, %r14
-; SSSE3-NEXT:    imulq %rcx, %r14
-; SSSE3-NEXT:    movq %r10, %rax
+; SSSE3-NEXT:    addq %rax, %r11
+; SSSE3-NEXT:    addq %rdx, %r11
+; SSSE3-NEXT:    movq %rbp, %rax
+; SSSE3-NEXT:    sarq $63, %rax
+; SSSE3-NEXT:    movq %rax, %r14
+; SSSE3-NEXT:    imulq %r13, %r14
 ; SSSE3-NEXT:    mulq %r8
-; SSSE3-NEXT:    movq %rax, %r11
-; SSSE3-NEXT:    imulq %r8, %r10
-; SSSE3-NEXT:    addq %r14, %r10
-; SSSE3-NEXT:    addq %rdx, %r10
-; SSSE3-NEXT:    addq %r9, %r11
-; SSSE3-NEXT:    adcq %rbx, %r10
+; SSSE3-NEXT:    movq %rax, %r10
+; SSSE3-NEXT:    addq %rax, %r14
+; SSSE3-NEXT:    addq %rdx, %r14
+; SSSE3-NEXT:    addq %r9, %r10
+; SSSE3-NEXT:    adcq %r11, %r14
 ; SSSE3-NEXT:    movq %r8, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %r9
-; SSSE3-NEXT:    movq %rax, %rbx
-; SSSE3-NEXT:    movq %rcx, %rax
+; SSSE3-NEXT:    movq %rax, %r11
+; SSSE3-NEXT:    movq %r13, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %rsi
-; SSSE3-NEXT:    movq %rax, %r14
-; SSSE3-NEXT:    addq %r9, %r14
+; SSSE3-NEXT:    movq %rax, %rbx
+; SSSE3-NEXT:    addq %r9, %rbx
 ; SSSE3-NEXT:    adcq $0, %rsi
 ; SSSE3-NEXT:    movq %r8, %rax
 ; SSSE3-NEXT:    mulq %rbp
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    addq %r14, %r9
+; SSSE3-NEXT:    addq %rbx, %r9
 ; SSSE3-NEXT:    adcq %rsi, %r8
 ; SSSE3-NEXT:    setb %al
 ; SSSE3-NEXT:    movzbl %al, %esi
-; SSSE3-NEXT:    movq %rcx, %rax
+; SSSE3-NEXT:    movq %r13, %rax
 ; SSSE3-NEXT:    mulq %rbp
 ; SSSE3-NEXT:    addq %r8, %rax
 ; SSSE3-NEXT:    adcq %rsi, %rdx
-; SSSE3-NEXT:    addq %r11, %rax
-; SSSE3-NEXT:    adcq %r10, %rdx
-; SSSE3-NEXT:    movq %r9, 24(%r12)
+; SSSE3-NEXT:    addq %r10, %rax
+; SSSE3-NEXT:    adcq %r14, %rdx
+; SSSE3-NEXT:    movq %r9, 24(%r15)
 ; SSSE3-NEXT:    sarq $63, %r9
 ; SSSE3-NEXT:    xorq %r9, %rdx
 ; SSSE3-NEXT:    xorq %rax, %r9
@@ -3547,11 +3537,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    setne %al
 ; SSSE3-NEXT:    negl %eax
 ; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    negl %r15d
-; SSSE3-NEXT:    movd %r15d, %xmm0
+; SSSE3-NEXT:    negl %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movq %rbx, 16(%r12)
-; SSSE3-NEXT:    movq %rdi, (%r12)
+; SSSE3-NEXT:    movq %r11, 16(%r15)
+; SSSE3-NEXT:    movq %rdi, (%r15)
 ; SSSE3-NEXT:    popq %rbx
 ; SSSE3-NEXT:    popq %r12
 ; SSSE3-NEXT:    popq %r13
@@ -3569,112 +3559,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    pushq %r12
 ; SSE41-NEXT:    pushq %rbx
 ; SSE41-NEXT:    movq %r8, %r14
+; SSE41-NEXT:    movq %rcx, %r13
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rsi, %r11
 ; SSE41-NEXT:    movq %rdi, %r10
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT:    movq %r11, %r12
-; SSE41-NEXT:    sarq $63, %r12
-; SSE41-NEXT:    movq %r14, %rbx
-; SSE41-NEXT:    imulq %r12, %rbx
+; SSE41-NEXT:    movq %r11, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    movq %r9, %r15
+; SSE41-NEXT:    imulq %rcx, %r15
 ; SSE41-NEXT:    movq %r14, %rax
-; SSE41-NEXT:    mulq %r12
+; SSE41-NEXT:    mulq %rcx
 ; SSE41-NEXT:    movq %rax, %rdi
-; SSE41-NEXT:    imulq %r9, %r12
-; SSE41-NEXT:    addq %rbx, %r12
-; SSE41-NEXT:    addq %rdx, %r12
-; SSE41-NEXT:    movq %r9, %rbx
-; SSE41-NEXT:    sarq $63, %rbx
-; SSE41-NEXT:    movq %rbx, %r13
-; SSE41-NEXT:    imulq %r11, %r13
-; SSE41-NEXT:    movq %rbx, %rax
+; SSE41-NEXT:    addq %rax, %r15
+; SSE41-NEXT:    addq %rdx, %r15
+; SSE41-NEXT:    movq %r9, %rax
+; SSE41-NEXT:    sarq $63, %rax
+; SSE41-NEXT:    movq %rax, %rcx
+; SSE41-NEXT:    imulq %r11, %rcx
 ; SSE41-NEXT:    mulq %r10
-; SSE41-NEXT:    movq %rax, %r15
-; SSE41-NEXT:    imulq %r10, %rbx
-; SSE41-NEXT:    addq %r13, %rbx
-; SSE41-NEXT:    addq %rdx, %rbx
-; SSE41-NEXT:    addq %rdi, %r15
-; SSE41-NEXT:    adcq %r12, %rbx
+; SSE41-NEXT:    movq %rax, %rbx
+; SSE41-NEXT:    addq %rax, %rcx
+; SSE41-NEXT:    addq %rdx, %rcx
+; SSE41-NEXT:    addq %rdi, %rbx
+; SSE41-NEXT:    adcq %r15, %rcx
 ; SSE41-NEXT:    movq %r10, %rax
 ; SSE41-NEXT:    mulq %r14
-; SSE41-NEXT:    movq %rdx, %r12
+; SSE41-NEXT:    movq %rdx, %r15
 ; SSE41-NEXT:    movq %rax, %rdi
 ; SSE41-NEXT:    movq %r11, %rax
 ; SSE41-NEXT:    mulq %r14
 ; SSE41-NEXT:    movq %rdx, %r14
-; SSE41-NEXT:    movq %rax, %r13
-; SSE41-NEXT:    addq %r12, %r13
+; SSE41-NEXT:    movq %rax, %r12
+; SSE41-NEXT:    addq %r15, %r12
 ; SSE41-NEXT:    adcq $0, %r14
 ; SSE41-NEXT:    movq %r10, %rax
 ; SSE41-NEXT:    mulq %r9
-; SSE41-NEXT:    movq %rdx, %r12
+; SSE41-NEXT:    movq %rdx, %r15
 ; SSE41-NEXT:    movq %rax, %r10
-; SSE41-NEXT:    addq %r13, %r10
-; SSE41-NEXT:    adcq %r14, %r12
+; SSE41-NEXT:    addq %r12, %r10
+; SSE41-NEXT:    adcq %r14, %r15
 ; SSE41-NEXT:    setb %al
 ; SSE41-NEXT:    movzbl %al, %r14d
 ; SSE41-NEXT:    movq %r11, %rax
 ; SSE41-NEXT:    mulq %r9
-; SSE41-NEXT:    addq %r12, %rax
-; SSE41-NEXT:    adcq %r14, %rdx
 ; SSE41-NEXT:    addq %r15, %rax
-; SSE41-NEXT:    adcq %rbx, %rdx
-; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE41-NEXT:    movq %r10, 8(%r12)
+; SSE41-NEXT:    adcq %r14, %rdx
+; SSE41-NEXT:    addq %rbx, %rax
+; SSE41-NEXT:    adcq %rcx, %rdx
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT:    movq %r10, 8(%r15)
 ; SSE41-NEXT:    sarq $63, %r10
 ; SSE41-NEXT:    xorq %r10, %rdx
 ; SSE41-NEXT:    xorq %rax, %r10
-; SSE41-NEXT:    xorl %r15d, %r15d
+; SSE41-NEXT:    xorl %ecx, %ecx
 ; SSE41-NEXT:    orq %rdx, %r10
-; SSE41-NEXT:    setne %r15b
-; SSE41-NEXT:    movq %rcx, %rbx
-; SSE41-NEXT:    sarq $63, %rbx
-; SSE41-NEXT:    movq %rsi, %r10
-; SSE41-NEXT:    imulq %rbx, %r10
+; SSE41-NEXT:    setne %cl
+; SSE41-NEXT:    movq %r13, %r9
+; SSE41-NEXT:    sarq $63, %r9
+; SSE41-NEXT:    movq %rbp, %r11
+; SSE41-NEXT:    imulq %r9, %r11
 ; SSE41-NEXT:    movq %rsi, %rax
-; SSE41-NEXT:    mulq %rbx
+; SSE41-NEXT:    mulq %r9
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    imulq %rbp, %rbx
-; SSE41-NEXT:    addq %r10, %rbx
-; SSE41-NEXT:    addq %rdx, %rbx
-; SSE41-NEXT:    movq %rbp, %r10
-; SSE41-NEXT:    sarq $63, %r10
-; SSE41-NEXT:    movq %r10, %r14
-; SSE41-NEXT:    imulq %rcx, %r14
-; SSE41-NEXT:    movq %r10, %rax
+; SSE41-NEXT:    addq %rax, %r11
+; SSE41-NEXT:    addq %rdx, %r11
+; SSE41-NEXT:    movq %rbp, %rax
+; SSE41-NEXT:    sarq $63, %rax
+; SSE41-NEXT:    movq %rax, %r14
+; SSE41-NEXT:    imulq %r13, %r14
 ; SSE41-NEXT:    mulq %r8
-; SSE41-NEXT:    movq %rax, %r11
-; SSE41-NEXT:    imulq %r8, %r10
-; SSE41-NEXT:    addq %r14, %r10
-; SSE41-NEXT:    addq %rdx, %r10
-; SSE41-NEXT:    addq %r9, %r11
-; SSE41-NEXT:    adcq %rbx, %r10
+; SSE41-NEXT:    movq %rax, %r10
+; SSE41-NEXT:    addq %rax, %r14
+; SSE41-NEXT:    addq %rdx, %r14
+; SSE41-NEXT:    addq %r9, %r10
+; SSE41-NEXT:    adcq %r11, %r14
 ; SSE41-NEXT:    movq %r8, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %r9
-; SSE41-NEXT:    movq %rax, %rbx
-; SSE41-NEXT:    movq %rcx, %rax
+; SSE41-NEXT:    movq %rax, %r11
+; SSE41-NEXT:    movq %r13, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %rsi
-; SSE41-NEXT:    movq %rax, %r14
-; SSE41-NEXT:    addq %r9, %r14
+; SSE41-NEXT:    movq %rax, %rbx
+; SSE41-NEXT:    addq %r9, %rbx
 ; SSE41-NEXT:    adcq $0, %rsi
 ; SSE41-NEXT:    movq %r8, %rax
 ; SSE41-NEXT:    mulq %rbp
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    addq %r14, %r9
+; SSE41-NEXT:    addq %rbx, %r9
 ; SSE41-NEXT:    adcq %rsi, %r8
 ; SSE41-NEXT:    setb %al
 ; SSE41-NEXT:    movzbl %al, %esi
-; SSE41-NEXT:    movq %rcx, %rax
+; SSE41-NEXT:    movq %r13, %rax
 ; SSE41-NEXT:    mulq %rbp
 ; SSE41-NEXT:    addq %r8, %rax
 ; SSE41-NEXT:    adcq %rsi, %rdx
-; SSE41-NEXT:    addq %r11, %rax
-; SSE41-NEXT:    adcq %r10, %rdx
-; SSE41-NEXT:    movq %r9, 24(%r12)
+; SSE41-NEXT:    addq %r10, %rax
+; SSE41-NEXT:    adcq %r14, %rdx
+; SSE41-NEXT:    movq %r9, 24(%r15)
 ; SSE41-NEXT:    sarq $63, %r9
 ; SSE41-NEXT:    xorq %r9, %rdx
 ; SSE41-NEXT:    xorq %rax, %r9
@@ -3682,11 +3667,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    orq %rdx, %r9
 ; SSE41-NEXT:    setne %al
 ; SSE41-NEXT:    negl %eax
-; SSE41-NEXT:    negl %r15d
-; SSE41-NEXT:    movd %r15d, %xmm0
+; SSE41-NEXT:    negl %ecx
+; SSE41-NEXT:    movd %ecx, %xmm0
 ; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movq %rbx, 16(%r12)
-; SSE41-NEXT:    movq %rdi, (%r12)
+; SSE41-NEXT:    movq %r11, 16(%r15)
+; SSE41-NEXT:    movq %rdi, (%r15)
 ; SSE41-NEXT:    popq %rbx
 ; SSE41-NEXT:    popq %r12
 ; SSE41-NEXT:    popq %r13
@@ -3704,112 +3689,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    pushq %r12
 ; AVX-NEXT:    pushq %rbx
 ; AVX-NEXT:    movq %r8, %r14
+; AVX-NEXT:    movq %rcx, %r13
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rsi, %r11
 ; AVX-NEXT:    movq %rdi, %r10
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT:    movq %r11, %r12
-; AVX-NEXT:    sarq $63, %r12
-; AVX-NEXT:    movq %r14, %rbx
-; AVX-NEXT:    imulq %r12, %rbx
+; AVX-NEXT:    movq %r11, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    movq %r9, %r15
+; AVX-NEXT:    imulq %rcx, %r15
 ; AVX-NEXT:    movq %r14, %rax
-; AVX-NEXT:    mulq %r12
+; AVX-NEXT:    mulq %rcx
 ; AVX-NEXT:    movq %rax, %rdi
-; AVX-NEXT:    imulq %r9, %r12
-; AVX-NEXT:    addq %rbx, %r12
-; AVX-NEXT:    addq %rdx, %r12
-; AVX-NEXT:    movq %r9, %rbx
-; AVX-NEXT:    sarq $63, %rbx
-; AVX-NEXT:    movq %rbx, %r13
-; AVX-NEXT:    imulq %r11, %r13
-; AVX-NEXT:    movq %rbx, %rax
+; AVX-NEXT:    addq %rax, %r15
+; AVX-NEXT:    addq %rdx, %r15
+; AVX-NEXT:    movq %r9, %rax
+; AVX-NEXT:    sarq $63, %rax
+; AVX-NEXT:    movq %rax, %rcx
+; AVX-NEXT:    imulq %r11, %rcx
 ; AVX-NEXT:    mulq %r10
-; AVX-NEXT:    movq %rax, %r15
-; AVX-NEXT:    imulq %r10, %rbx
-; AVX-NEXT:    addq %r13, %rbx
-; AVX-NEXT:    addq %rdx, %rbx
-; AVX-NEXT:    addq %rdi, %r15
-; AVX-NEXT:    adcq %r12, %rbx
+; AVX-NEXT:    movq %rax, %rbx
+; AVX-NEXT:    addq %rax, %rcx
+; AVX-NEXT:    addq %rdx, %rcx
+; AVX-NEXT:    addq %rdi, %rbx
+; AVX-NEXT:    adcq %r15, %rcx
 ; AVX-NEXT:    movq %r10, %rax
 ; AVX-NEXT:    mulq %r14
-; AVX-NEXT:    movq %rdx, %r12
+; AVX-NEXT:    movq %rdx, %r15
 ; AVX-NEXT:    movq %rax, %rdi
 ; AVX-NEXT:    movq %r11, %rax
 ; AVX-NEXT:    mulq %r14
 ; AVX-NEXT:    movq %rdx, %r14
-; AVX-NEXT:    movq %rax, %r13
-; AVX-NEXT:    addq %r12, %r13
+; AVX-NEXT:    movq %rax, %r12
+; AVX-NEXT:    addq %r15, %r12
 ; AVX-NEXT:    adcq $0, %r14
 ; AVX-NEXT:    movq %r10, %rax
 ; AVX-NEXT:    mulq %r9
-; AVX-NEXT:    movq %rdx, %r12
+; AVX-NEXT:    movq %rdx, %r15
 ; AVX-NEXT:    movq %rax, %r10
-; AVX-NEXT:    addq %r13, %r10
-; AVX-NEXT:    adcq %r14, %r12
+; AVX-NEXT:    addq %r12, %r10
+; AVX-NEXT:    adcq %r14, %r15
 ; AVX-NEXT:    setb %al
 ; AVX-NEXT:    movzbl %al, %r14d
 ; AVX-NEXT:    movq %r11, %rax
 ; AVX-NEXT:    mulq %r9
-; AVX-NEXT:    addq %r12, %rax
-; AVX-NEXT:    adcq %r14, %rdx
 ; AVX-NEXT:    addq %r15, %rax
-; AVX-NEXT:    adcq %rbx, %rdx
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX-NEXT:    movq %r10, 8(%r12)
+; AVX-NEXT:    adcq %r14, %rdx
+; AVX-NEXT:    addq %rbx, %rax
+; AVX-NEXT:    adcq %rcx, %rdx
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT:    movq %r10, 8(%r15)
 ; AVX-NEXT:    sarq $63, %r10
 ; AVX-NEXT:    xorq %r10, %rdx
 ; AVX-NEXT:    xorq %rax, %r10
-; AVX-NEXT:    xorl %r15d, %r15d
+; AVX-NEXT:    xorl %ecx, %ecx
 ; AVX-NEXT:    orq %rdx, %r10
-; AVX-NEXT:    setne %r15b
-; AVX-NEXT:    movq %rcx, %rbx
-; AVX-NEXT:    sarq $63, %rbx
-; AVX-NEXT:    movq %rsi, %r10
-; AVX-NEXT:    imulq %rbx, %r10
+; AVX-NEXT:    setne %cl
+; AVX-NEXT:    movq %r13, %r9
+; AVX-NEXT:    sarq $63, %r9
+; AVX-NEXT:    movq %rbp, %r11
+; AVX-NEXT:    imulq %r9, %r11
 ; AVX-NEXT:    movq %rsi, %rax
-; AVX-NEXT:    mulq %rbx
+; AVX-NEXT:    mulq %r9
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    imulq %rbp, %rbx
-; AVX-NEXT:    addq %r10, %rbx
-; AVX-NEXT:    addq %rdx, %rbx
-; AVX-NEXT:    movq %rbp, %r10
-; AVX-NEXT:    sarq $63, %r10
-; AVX-NEXT:    movq %r10, %r14
-; AVX-NEXT:    imulq %rcx, %r14
-; AVX-NEXT:    movq %r10, %rax
+; AVX-NEXT:    addq %rax, %r11
+; AVX-NEXT:    addq %rdx, %r11
+; AVX-NEXT:    movq %rbp, %rax
+; AVX-NEXT:    sarq $63, %rax
+; AVX-NEXT:    movq %rax, %r14
+; AVX-NEXT:    imulq %r13, %r14
 ; AVX-NEXT:    mulq %r8
-; AVX-NEXT:    movq %rax, %r11
-; AVX-NEXT:    imulq %r8, %r10
-; AVX-NEXT:    addq %r14, %r10
-; AVX-NEXT:    addq %rdx, %r10
-; AVX-NEXT:    addq %r9, %r11
-; AVX-NEXT:    adcq %rbx, %r10
+; AVX-NEXT:    movq %rax, %r10
+; AVX-NEXT:    addq %rax, %r14
+; AVX-NEXT:    addq %rdx, %r14
+; AVX-NEXT:    addq %r9, %r10
+; AVX-NEXT:    adcq %r11, %r14
 ; AVX-NEXT:    movq %r8, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %r9
-; AVX-NEXT:    movq %rax, %rbx
-; AVX-NEXT:    movq %rcx, %rax
+; AVX-NEXT:    movq %rax, %r11
+; AVX-NEXT:    movq %r13, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %rsi
-; AVX-NEXT:    movq %rax, %r14
-; AVX-NEXT:    addq %r9, %r14
+; AVX-NEXT:    movq %rax, %rbx
+; AVX-NEXT:    addq %r9, %rbx
 ; AVX-NEXT:    adcq $0, %rsi
 ; AVX-NEXT:    movq %r8, %rax
 ; AVX-NEXT:    mulq %rbp
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    addq %r14, %r9
+; AVX-NEXT:    addq %rbx, %r9
 ; AVX-NEXT:    adcq %rsi, %r8
 ; AVX-NEXT:    setb %al
 ; AVX-NEXT:    movzbl %al, %esi
-; AVX-NEXT:    movq %rcx, %rax
+; AVX-NEXT:    movq %r13, %rax
 ; AVX-NEXT:    mulq %rbp
 ; AVX-NEXT:    addq %r8, %rax
 ; AVX-NEXT:    adcq %rsi, %rdx
-; AVX-NEXT:    addq %r11, %rax
-; AVX-NEXT:    adcq %r10, %rdx
-; AVX-NEXT:    movq %r9, 24(%r12)
+; AVX-NEXT:    addq %r10, %rax
+; AVX-NEXT:    adcq %r14, %rdx
+; AVX-NEXT:    movq %r9, 24(%r15)
 ; AVX-NEXT:    sarq $63, %r9
 ; AVX-NEXT:    xorq %r9, %rdx
 ; AVX-NEXT:    xorq %rax, %r9
@@ -3817,11 +3797,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    orq %rdx, %r9
 ; AVX-NEXT:    setne %al
 ; AVX-NEXT:    negl %eax
-; AVX-NEXT:    negl %r15d
-; AVX-NEXT:    vmovd %r15d, %xmm0
+; AVX-NEXT:    negl %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm0
 ; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movq %rbx, 16(%r12)
-; AVX-NEXT:    movq %rdi, (%r12)
+; AVX-NEXT:    movq %r11, 16(%r15)
+; AVX-NEXT:    movq %rdi, (%r15)
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    popq %r12
 ; AVX-NEXT:    popq %r13
@@ -3838,113 +3818,104 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    pushq %r13
 ; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq %r9, %rbp
 ; AVX512F-NEXT:    movq %rcx, %r11
 ; AVX512F-NEXT:    movq %rdx, %r10
-; AVX512F-NEXT:    movq %rsi, %r9
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT:    movq %rcx, %r12
-; AVX512F-NEXT:    sarq $63, %r12
-; AVX512F-NEXT:    movq %r15, %rbx
-; AVX512F-NEXT:    imulq %r12, %rbx
-; AVX512F-NEXT:    movq %r15, %rax
-; AVX512F-NEXT:    mulq %r12
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    imulq %rsi, %r12
-; AVX512F-NEXT:    addq %rbx, %r12
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX512F-NEXT:    sarq $63, %rcx
+; AVX512F-NEXT:    movq %rbp, %r12
+; AVX512F-NEXT:    imulq %rcx, %r12
+; AVX512F-NEXT:    movq %r14, %rax
+; AVX512F-NEXT:    mulq %rcx
+; AVX512F-NEXT:    movq %rax, %r15
+; AVX512F-NEXT:    addq %rax, %r12
 ; AVX512F-NEXT:    addq %rdx, %r12
-; AVX512F-NEXT:    movq %rsi, %rbx
-; AVX512F-NEXT:    sarq $63, %rbx
-; AVX512F-NEXT:    movq %rbx, %r13
-; AVX512F-NEXT:    imulq %r11, %r13
-; AVX512F-NEXT:    movq %rbx, %rax
+; AVX512F-NEXT:    movq %rbp, %rax
+; AVX512F-NEXT:    sarq $63, %rax
+; AVX512F-NEXT:    movq %rax, %rcx
+; AVX512F-NEXT:    imulq %r11, %rcx
 ; AVX512F-NEXT:    mulq %r10
-; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    imulq %r10, %rbx
-; AVX512F-NEXT:    addq %r13, %rbx
-; AVX512F-NEXT:    addq %rdx, %rbx
-; AVX512F-NEXT:    addq %rcx, %r14
-; AVX512F-NEXT:    adcq %r12, %rbx
+; AVX512F-NEXT:    movq %rax, %rbx
+; AVX512F-NEXT:    addq %rax, %rcx
+; AVX512F-NEXT:    addq %rdx, %rcx
+; AVX512F-NEXT:    addq %r15, %rbx
+; AVX512F-NEXT:    adcq %r12, %rcx
 ; AVX512F-NEXT:    movq %r10, %rax
-; AVX512F-NEXT:    mulq %r15
-; AVX512F-NEXT:    movq %rdx, %r12
-; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movq %r11, %rax
-; AVX512F-NEXT:    mulq %r15
+; AVX512F-NEXT:    mulq %r14
 ; AVX512F-NEXT:    movq %rdx, %r15
-; AVX512F-NEXT:    movq %rax, %r13
-; AVX512F-NEXT:    addq %r12, %r13
-; AVX512F-NEXT:    adcq $0, %r15
+; AVX512F-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512F-NEXT:    movq %r11, %rax
+; AVX512F-NEXT:    mulq %r14
+; AVX512F-NEXT:    movq %rdx, %r14
+; AVX512F-NEXT:    movq %rax, %r12
+; AVX512F-NEXT:    addq %r15, %r12
+; AVX512F-NEXT:    adcq $0, %r14
 ; AVX512F-NEXT:    movq %r10, %rax
-; AVX512F-NEXT:    mulq %rsi
-; AVX512F-NEXT:    movq %rdx, %r12
+; AVX512F-NEXT:    mulq %rbp
+; AVX512F-NEXT:    movq %rdx, %r15
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %r13, %r10
-; AVX512F-NEXT:    adcq %r15, %r12
+; AVX512F-NEXT:    addq %r12, %r10
+; AVX512F-NEXT:    adcq %r14, %r15
 ; AVX512F-NEXT:    setb %al
-; AVX512F-NEXT:    movzbl %al, %r15d
+; AVX512F-NEXT:    movzbl %al, %r14d
 ; AVX512F-NEXT:    movq %r11, %rax
-; AVX512F-NEXT:    mulq %rsi
-; AVX512F-NEXT:    addq %r12, %rax
-; AVX512F-NEXT:    adcq %r15, %rdx
-; AVX512F-NEXT:    addq %r14, %rax
-; AVX512F-NEXT:    adcq %rbx, %rdx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT:    movq %r10, 24(%r12)
+; AVX512F-NEXT:    mulq %rbp
+; AVX512F-NEXT:    addq %r15, %rax
+; AVX512F-NEXT:    adcq %r14, %rdx
+; AVX512F-NEXT:    addq %rbx, %rax
+; AVX512F-NEXT:    adcq %rcx, %rdx
+; AVX512F-NEXT:    movq %r10, 24(%r13)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
 ; AVX512F-NEXT:    orq %rdx, %r10
 ; AVX512F-NEXT:    setne %al
 ; AVX512F-NEXT:    kmovw %eax, %k0
-; AVX512F-NEXT:    movq %r9, %rsi
-; AVX512F-NEXT:    sarq $63, %rsi
-; AVX512F-NEXT:    movq %r8, %r11
-; AVX512F-NEXT:    imulq %rsi, %r11
+; AVX512F-NEXT:    movq %rsi, %rcx
+; AVX512F-NEXT:    sarq $63, %rcx
+; AVX512F-NEXT:    movq %r9, %rbx
+; AVX512F-NEXT:    imulq %rcx, %rbx
 ; AVX512F-NEXT:    movq %r8, %rax
-; AVX512F-NEXT:    mulq %rsi
+; AVX512F-NEXT:    mulq %rcx
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    imulq %rbp, %rsi
-; AVX512F-NEXT:    addq %r11, %rsi
-; AVX512F-NEXT:    addq %rdx, %rsi
-; AVX512F-NEXT:    movq %rbp, %r11
-; AVX512F-NEXT:    sarq $63, %r11
-; AVX512F-NEXT:    movq %r11, %r14
-; AVX512F-NEXT:    imulq %r9, %r14
-; AVX512F-NEXT:    movq %r11, %rax
+; AVX512F-NEXT:    addq %rax, %rbx
+; AVX512F-NEXT:    addq %rdx, %rbx
+; AVX512F-NEXT:    movq %r9, %rax
+; AVX512F-NEXT:    sarq $63, %rax
+; AVX512F-NEXT:    movq %rax, %rcx
+; AVX512F-NEXT:    imulq %rsi, %rcx
 ; AVX512F-NEXT:    mulq %rdi
-; AVX512F-NEXT:    movq %rax, %rbx
-; AVX512F-NEXT:    imulq %rdi, %r11
-; AVX512F-NEXT:    addq %r14, %r11
-; AVX512F-NEXT:    addq %rdx, %r11
-; AVX512F-NEXT:    addq %r10, %rbx
-; AVX512F-NEXT:    adcq %rsi, %r11
+; AVX512F-NEXT:    movq %rax, %r11
+; AVX512F-NEXT:    addq %rax, %rcx
+; AVX512F-NEXT:    addq %rdx, %rcx
+; AVX512F-NEXT:    addq %r10, %r11
+; AVX512F-NEXT:    adcq %rbx, %rcx
 ; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    movq %rdx, %r10
-; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    movq %r9, %rax
+; AVX512F-NEXT:    movq %rax, %rbx
+; AVX512F-NEXT:    movq %rsi, %rax
 ; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    movq %rdx, %r8
-; AVX512F-NEXT:    movq %rax, %r15
-; AVX512F-NEXT:    addq %r10, %r15
+; AVX512F-NEXT:    movq %rax, %r14
+; AVX512F-NEXT:    addq %r10, %r14
 ; AVX512F-NEXT:    adcq $0, %r8
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    mulq %rbp
+; AVX512F-NEXT:    mulq %r9
 ; AVX512F-NEXT:    movq %rdx, %rdi
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %r15, %r10
+; AVX512F-NEXT:    addq %r14, %r10
 ; AVX512F-NEXT:    adcq %r8, %rdi
 ; AVX512F-NEXT:    setb %al
-; AVX512F-NEXT:    movzbl %al, %esi
-; AVX512F-NEXT:    movq %r9, %rax
-; AVX512F-NEXT:    mulq %rbp
+; AVX512F-NEXT:    movzbl %al, %r8d
+; AVX512F-NEXT:    movq %rsi, %rax
+; AVX512F-NEXT:    mulq %r9
 ; AVX512F-NEXT:    addq %rdi, %rax
-; AVX512F-NEXT:    adcq %rsi, %rdx
-; AVX512F-NEXT:    addq %rbx, %rax
-; AVX512F-NEXT:    adcq %r11, %rdx
-; AVX512F-NEXT:    movq %r10, 8(%r12)
+; AVX512F-NEXT:    adcq %r8, %rdx
+; AVX512F-NEXT:    addq %r11, %rax
+; AVX512F-NEXT:    adcq %rcx, %rdx
+; AVX512F-NEXT:    movq %r10, 8(%r13)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
@@ -3956,8 +3927,9 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    korw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT:    movq %rcx, 16(%r12)
-; AVX512F-NEXT:    movq %r14, (%r12)
+; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512F-NEXT:    movq %rax, 16(%r13)
+; AVX512F-NEXT:    movq %rbx, (%r13)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r12
 ; AVX512F-NEXT:    popq %r13
@@ -3974,113 +3946,104 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    pushq %r13
 ; AVX512BW-NEXT:    pushq %r12
 ; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    movq %r9, %rbp
 ; AVX512BW-NEXT:    movq %rcx, %r11
 ; AVX512BW-NEXT:    movq %rdx, %r10
-; AVX512BW-NEXT:    movq %rsi, %r9
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512BW-NEXT:    movq %rcx, %r12
-; AVX512BW-NEXT:    sarq $63, %r12
-; AVX512BW-NEXT:    movq %r15, %rbx
-; AVX512BW-NEXT:    imulq %r12, %rbx
-; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    mulq %r12
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    imulq %rsi, %r12
-; AVX512BW-NEXT:    addq %rbx, %r12
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX512BW-NEXT:    sarq $63, %rcx
+; AVX512BW-NEXT:    movq %rbp, %r12
+; AVX512BW-NEXT:    imulq %rcx, %r12
+; AVX512BW-NEXT:    movq %r14, %rax
+; AVX512BW-NEXT:    mulq %rcx
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    addq %rax, %r12
 ; AVX512BW-NEXT:    addq %rdx, %r12
-; AVX512BW-NEXT:    movq %rsi, %rbx
-; AVX512BW-NEXT:    sarq $63, %rbx
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    imulq %r11, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    sarq $63, %rax
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    imulq %r11, %rcx
 ; AVX512BW-NEXT:    mulq %r10
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    imulq %r10, %rbx
-; AVX512BW-NEXT:    addq %r13, %rbx
-; AVX512BW-NEXT:    addq %rdx, %rbx
-; AVX512BW-NEXT:    addq %rcx, %r14
-; AVX512BW-NEXT:    adcq %r12, %rbx
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    addq %rdx, %rcx
+; AVX512BW-NEXT:    addq %r15, %rbx
+; AVX512BW-NEXT:    adcq %r12, %rcx
 ; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    mulq %r15
-; AVX512BW-NEXT:    movq %rdx, %r12
-; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    mulq %r15
+; AVX512BW-NEXT:    mulq %r14
 ; AVX512BW-NEXT:    movq %rdx, %r15
-; AVX512BW-NEXT:    movq %rax, %r13
-; AVX512BW-NEXT:    addq %r12, %r13
-; AVX512BW-NEXT:    adcq $0, %r15
+; AVX512BW-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    mulq %r14
+; AVX512BW-NEXT:    movq %rdx, %r14
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    addq %r15, %r12
+; AVX512BW-NEXT:    adcq $0, %r14
 ; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    mulq %rsi
-; AVX512BW-NEXT:    movq %rdx, %r12
+; AVX512BW-NEXT:    mulq %rbp
+; AVX512BW-NEXT:    movq %rdx, %r15
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %r13, %r10
-; AVX512BW-NEXT:    adcq %r15, %r12
+; AVX512BW-NEXT:    addq %r12, %r10
+; AVX512BW-NEXT:    adcq %r14, %r15
 ; AVX512BW-NEXT:    setb %al
-; AVX512BW-NEXT:    movzbl %al, %r15d
+; AVX512BW-NEXT:    movzbl %al, %r14d
 ; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    mulq %rsi
-; AVX512BW-NEXT:    addq %r12, %rax
-; AVX512BW-NEXT:    adcq %r15, %rdx
-; AVX512BW-NEXT:    addq %r14, %rax
-; AVX512BW-NEXT:    adcq %rbx, %rdx
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512BW-NEXT:    movq %r10, 24(%r12)
+; AVX512BW-NEXT:    mulq %rbp
+; AVX512BW-NEXT:    addq %r15, %rax
+; AVX512BW-NEXT:    adcq %r14, %rdx
+; AVX512BW-NEXT:    addq %rbx, %rax
+; AVX512BW-NEXT:    adcq %rcx, %rdx
+; AVX512BW-NEXT:    movq %r10, 24(%r13)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
 ; AVX512BW-NEXT:    orq %rdx, %r10
 ; AVX512BW-NEXT:    setne %al
 ; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    movq %r9, %rsi
-; AVX512BW-NEXT:    sarq $63, %rsi
-; AVX512BW-NEXT:    movq %r8, %r11
-; AVX512BW-NEXT:    imulq %rsi, %r11
+; AVX512BW-NEXT:    movq %rsi, %rcx
+; AVX512BW-NEXT:    sarq $63, %rcx
+; AVX512BW-NEXT:    movq %r9, %rbx
+; AVX512BW-NEXT:    imulq %rcx, %rbx
 ; AVX512BW-NEXT:    movq %r8, %rax
-; AVX512BW-NEXT:    mulq %rsi
+; AVX512BW-NEXT:    mulq %rcx
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    imulq %rbp, %rsi
-; AVX512BW-NEXT:    addq %r11, %rsi
-; AVX512BW-NEXT:    addq %rdx, %rsi
-; AVX512BW-NEXT:    movq %rbp, %r11
-; AVX512BW-NEXT:    sarq $63, %r11
-; AVX512BW-NEXT:    movq %r11, %r14
-; AVX512BW-NEXT:    imulq %r9, %r14
-; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    addq %rax, %rbx
+; AVX512BW-NEXT:    addq %rdx, %rbx
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    sarq $63, %rax
+; AVX512BW-NEXT:    movq %rax, %rcx
+; AVX512BW-NEXT:    imulq %rsi, %rcx
 ; AVX512BW-NEXT:    mulq %rdi
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    imulq %rdi, %r11
-; AVX512BW-NEXT:    addq %r14, %r11
-; AVX512BW-NEXT:    addq %rdx, %r11
-; AVX512BW-NEXT:    addq %r10, %rbx
-; AVX512BW-NEXT:    adcq %rsi, %r11
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    addq %rax, %rcx
+; AVX512BW-NEXT:    addq %rdx, %rcx
+; AVX512BW-NEXT:    addq %r10, %r11
+; AVX512BW-NEXT:    adcq %rbx, %rcx
 ; AVX512BW-NEXT:    movq %rdi, %rax
 ; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    movq %rdx, %r10
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    movq %rsi, %rax
 ; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    addq %r10, %r15
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    addq %r10, %r14
 ; AVX512BW-NEXT:    adcq $0, %r8
 ; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    mulq %rbp
+; AVX512BW-NEXT:    mulq %r9
 ; AVX512BW-NEXT:    movq %rdx, %rdi
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %r15, %r10
+; AVX512BW-NEXT:    addq %r14, %r10
 ; AVX512BW-NEXT:    adcq %r8, %rdi
 ; AVX512BW-NEXT:    setb %al
-; AVX512BW-NEXT:    movzbl %al, %esi
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    mulq %rbp
+; AVX512BW-NEXT:    movzbl %al, %r8d
+; AVX512BW-NEXT:    movq %rsi, %rax
+; AVX512BW-NEXT:    mulq %r9
 ; AVX512BW-NEXT:    addq %rdi, %rax
-; AVX512BW-NEXT:    adcq %rsi, %rdx
-; AVX512BW-NEXT:    addq %rbx, %rax
-; AVX512BW-NEXT:    adcq %r11, %rdx
-; AVX512BW-NEXT:    movq %r10, 8(%r12)
+; AVX512BW-NEXT:    adcq %r8, %rdx
+; AVX512BW-NEXT:    addq %r11, %rax
+; AVX512BW-NEXT:    adcq %rcx, %rdx
+; AVX512BW-NEXT:    movq %r10, 8(%r13)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
@@ -4092,8 +4055,9 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    korw %k0, %k1, %k1
 ; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT:    movq %rcx, 16(%r12)
-; AVX512BW-NEXT:    movq %r14, (%r12)
+; AVX512BW-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT:    movq %rax, 16(%r13)
+; AVX512BW-NEXT:    movq %rbx, (%r13)
 ; AVX512BW-NEXT:    popq %rbx
 ; AVX512BW-NEXT:    popq %r12
 ; AVX512BW-NEXT:    popq %r13

diff  --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 8d68303300ec6..cbbe089c80192 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -212,68 +212,66 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $8, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %ebx
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    imull %ebx, %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    subl $12, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %eax, %ebx
-; WIN32-NEXT:    addl %esi, %ebx
-; WIN32-NEXT:    addl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edx
-; WIN32-NEXT:    imull %ecx, %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    imull %ecx, %edi
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %edi, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %ebx, %esi
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %eax, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    imull %ebp, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    adcl %edi, %ecx
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %ebx, %esi
 ; WIN32-NEXT:    adcl $0, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    adcl %edi, %ebx
-; WIN32-NEXT:    setb %cl
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    addl %esi, %ebx
+; WIN32-NEXT:    adcl %edi, %ebp
+; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebx, %eax
-; WIN32-NEXT:    movzbl %cl, %ecx
-; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %ecx, %edx
+; WIN32-NEXT:    movl %ebx, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %ebp, 4(%eax)
+; WIN32-NEXT:    movl %ebx, 4(%eax)
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $8, %esp
+; WIN32-NEXT:    addl $12, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -572,65 +570,65 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ebp, %edi
-; WIN32-NEXT:    imull %ecx, %edi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %ebx, %ecx
-; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    movl %ecx, %edx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    sarl $31, %edx
+; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    imull %edx, %ecx
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %eax, %ecx
 ; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebx, %edx
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    imull %edi, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    imull %edi, %ebx
-; WIN32-NEXT:    addl %edx, %ebx
 ; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    addl %edx, %ebx
-; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    addl %eax, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    addl %ebp, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %ebx
+; WIN32-NEXT:    adcl %ecx, %esi
 ; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %esi, %edi
-; WIN32-NEXT:    adcl $0, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %ebp, %ecx
+; WIN32-NEXT:    adcl $0, %ebx
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %edi, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    adcl %ecx, %ebp
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %ecx, %ebp
+; WIN32-NEXT:    adcl %ebx, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT:    adcl %ebx, %edx
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    xorl %esi, %edx
-; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    sarl $31, %ebp
+; WIN32-NEXT:    xorl %ebp, %edx
+; WIN32-NEXT:    xorl %eax, %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    orl %edx, %esi
+; WIN32-NEXT:    orl %edx, %ebp
 ; WIN32-NEXT:    jne LBB12_2
 ; WIN32-NEXT:  # %bb.1:
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:  LBB12_2:
-; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    movl %ebx, %edx
 ; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
@@ -991,57 +989,54 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %eax, %edi
-; WIN32-NEXT:    addl %esi, %edi
-; WIN32-NEXT:    addl %edx, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edx
-; WIN32-NEXT:    imull %ecx, %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ebx, %esi
 ; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    addl %edx, %esi
 ; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %eax, %esi
 ; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %ebx, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %esi
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    movl %ecx, %ebx
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    imull %edi, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    adcl %esi, %ecx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %edi, %ecx
-; WIN32-NEXT:    adcl $0, %ebp
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %edi, %ebp
+; WIN32-NEXT:    adcl $0, %esi
 ; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ecx, %edi
-; WIN32-NEXT:    adcl %ebp, %ebx
-; WIN32-NEXT:    setb %cl
+; WIN32-NEXT:    addl %ebp, %edi
+; WIN32-NEXT:    adcl %esi, %ebx
+; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    addl %ebx, %eax
-; WIN32-NEXT:    movzbl %cl, %ecx
-; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    sarl $31, %edi
 ; WIN32-NEXT:    xorl %edi, %edx
 ; WIN32-NEXT:    xorl %eax, %edi
@@ -1050,7 +1045,7 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:  # %bb.3: # %continue
 ; WIN32-NEXT:    movb $1, %al
 ; WIN32-NEXT:  LBB18_2: # %overflow
-; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    addl $8, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1699,69 +1694,68 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $16, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    subl $20, %esp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %esi
-; WIN32-NEXT:    movl 4(%eax), %ebp
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebx, %ecx
-; WIN32-NEXT:    imull %ebp, %ecx
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    imull %esi, %ebx
-; WIN32-NEXT:    addl %ecx, %ebx
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    addl %edx, %ebx
-; WIN32-NEXT:    movl %ebp, %ecx
-; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl (%eax), %edx
+; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl 4(%eax), %ebx
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %eax, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    imull %edi, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    imull %ecx, %edi
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    addl %eax, %ecx
 ; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %ebx, %ecx
+; WIN32-NEXT:    addl %eax, %ebp
+; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    adcl %esi, %ecx
+; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ebx, %edi
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl $0, %ebp
 ; WIN32-NEXT:    movl %esi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %edi, %esi
-; WIN32-NEXT:    adcl %ebp, %ebx
-; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    addl %edi, %ebx
+; WIN32-NEXT:    adcl %ebp, %esi
+; WIN32-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebx, %eax
-; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; WIN32-NEXT:    adcl %edi, %edx
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    movzbl (%esp), %esi # 1-byte Folded Reload
+; WIN32-NEXT:    adcl %esi, %edx
 ; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    movl %ebx, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %esi, 4(%eax)
+; WIN32-NEXT:    movl %ebx, 4(%eax)
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $16, %esp
+; WIN32-NEXT:    addl $20, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1810,62 +1804,58 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %ebx
 ; WIN32-NEXT:    movl 4(%eax), %ebp
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %ebx, %esi
-; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    imull %ebp, %edi
-; WIN32-NEXT:    addl %esi, %edi
-; WIN32-NEXT:    addl %edx, %edi
-; WIN32-NEXT:    movl %ebp, %esi
 ; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edx
-; WIN32-NEXT:    imull %ecx, %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    imull %ecx, %edi
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %eax, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    addl %esi, %eax
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %esi
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    adcl %edi, %ecx
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ebx
 ; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %ebp, %esi
 ; WIN32-NEXT:    adcl $0, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    adcl %edi, %ebx
-; WIN32-NEXT:    setb %cl
+; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    addl %esi, %ebx
+; WIN32-NEXT:    adcl %edi, %ebp
+; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT:    addl %ebx, %eax
-; WIN32-NEXT:    movzbl %cl, %ecx
-; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
-; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %ecx, %edx
+; WIN32-NEXT:    movl %ebx, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %ebp, 4(%eax)
-; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; WIN32-NEXT:    movl %ebx, 4(%eax)
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    addl $16, %esp


        

