[llvm] 30eff7f - [DAG] Attempt to replace a mul node with an existing umul_lohi/smul_lohi node (PR59217)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 29 04:51:41 PST 2022
Author: Simon Pilgrim
Date: 2022-11-29T12:51:30Z
New Revision: 30eff7f29f97599a94a40907f5b77244af0eaee1
URL: https://github.com/llvm/llvm-project/commit/30eff7f29f97599a94a40907f5b77244af0eaee1
DIFF: https://github.com/llvm/llvm-project/commit/30eff7f29f97599a94a40907f5b77244af0eaee1.diff
LOG: [DAG] Attempt to replace a mul node with an existing umul_lohi/smul_lohi node (PR59217)
As discussed on Issue #59217, under certain circumstances the DAG can generate duplicate MUL and MUL_LOHI nodes, often during MULO legalization.
This patch attempts to replace such a MUL node with an additional use of the LO result from the existing MUL_LOHI node.
Differential Revision: https://reviews.llvm.org/D138790
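For illustration, a minimal sketch of the pattern this combine targets, modelled on the combine_mul_umul_lohi_i64 test updated below; only the zext lines are visible in the diff hunk, so the rest of the body is an assumed reconstruction from the test name rather than the exact test source:

; Hypothetical reconstruction: the full multiply is legalized to
; ISD::UMUL_LOHI (its HI half feeds the lshr/trunc), while the plain
; i64 mul duplicates the LO half. With this combine the mul reuses
; result 0 of the existing UMUL_LOHI, dropping the extra imulq on X86.
define i64 @combine_mul_umul_lohi_i64(i64 %a, i64 %b) {
  %a128 = zext i64 %a to i128
  %b128 = zext i64 %b to i128
  %m128 = mul i128 %a128, %b128      ; becomes UMUL_LOHI during legalization
  %hi128 = lshr i128 %m128, 64
  %hi = trunc i128 %hi128 to i64     ; keeps the HI result live
  %lo = mul i64 %a, %b               ; duplicate of the LO result
  %r = xor i64 %lo, %hi
  ret i64 %r
}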
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
llvm/test/CodeGen/X86/combine-mul.ll
llvm/test/CodeGen/X86/muloti.ll
llvm/test/CodeGen/X86/smul-with-overflow.ll
llvm/test/CodeGen/X86/smul_fix_sat.ll
llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/xmulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index fd1259b693bc0..9970ba20610ba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4034,6 +4034,21 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
getShiftAmountTy(N0.getValueType()))));
}
+ // Attempt to reuse an existing umul_lohi/smul_lohi node, but only if the
+ // hi result is in use in case we hit this mid-legalization.
+ for (unsigned LoHiOpc : {ISD::UMUL_LOHI, ISD::SMUL_LOHI}) {
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(LoHiOpc, VT)) {
+ SDVTList LoHiVT = DAG.getVTList(VT, VT);
+ // TODO: Can we match commutable operands with getNodeIfExists?
+ if (SDNode *LoHi = DAG.getNodeIfExists(LoHiOpc, LoHiVT, {N0, N1}))
+ if (LoHi->hasAnyUseOfValue(1))
+ return SDValue(LoHi, 0);
+ if (SDNode *LoHi = DAG.getNodeIfExists(LoHiOpc, LoHiVT, {N1, N0}))
+ if (LoHi->hasAnyUseOfValue(1))
+ return SDValue(LoHi, 0);
+ }
+ }
+
// Try to transform:
// (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
// mul x, (2^N + 1) --> add (shl x, N), x
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 0a62c42969bb9..fdcceea353bcb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -34,21 +34,19 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX9-NEXT: v_mov_b32_e32 v5, v0
; GFX9-NEXT: v_mov_b32_e32 v4, v1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v3, 0
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, v2
-; GFX9-NEXT: v_mul_lo_u32 v5, v5, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v5, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v8, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: v_add3_u32 v1, v1, v5, v4
+; GFX9-NEXT: v_add3_u32 v1, v1, v5, v7
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -60,19 +58,17 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v4, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v5, v2, 0
-; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v5, v3, 0
-; GFX10-NEXT: v_mov_b32_e32 v8, v1
-; GFX10-NEXT: v_mul_lo_u32 v5, v5, v2
-; GFX10-NEXT: v_mul_lo_u32 v4, v4, v3
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_add3_u32 v1, v1, v4, v5
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v6, v9
-; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v11
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v5, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v5, v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: v_add3_u32 v1, v1, v6, v8
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -85,23 +81,21 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
-; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
-; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v5, v3, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v8, v1
-; GFX11-NEXT: v_mul_lo_u32 v5, v5, v2
-; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v1, v1, v4, v5
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v7, v10, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v12, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v11
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v5, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v5, v3, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: v_add3_u32 v1, v1, v6, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -157,28 +151,26 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v3, 0
; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, 0
; GFX9-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v4, v3, 0
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v10, v6
+; GFX9-NEXT: v_mad_i64_i32 v[10:11], s[4:5], v4, v3, 0
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v6, v2
-; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v9, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v7, v2
+; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v9, vcc
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v6, v5
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, v2
-; GFX9-NEXT: v_mul_lo_u32 v5, v5, v3
-; GFX9-NEXT: v_subbrev_co_u32_e32 v9, vcc, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v5
+; GFX9-NEXT: v_subbrev_co_u32_e32 v7, vcc, 0, v4, vcc
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
-; GFX9-NEXT: v_add3_u32 v1, v1, v5, v4
+; GFX9-NEXT: v_add3_u32 v1, v1, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v5, v4
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -195,21 +187,19 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v5, v2, 0
; GFX10-NEXT: v_mad_i64_i32 v[11:12], s4, v5, v3, 0
; GFX10-NEXT: v_mov_b32_e32 v8, v1
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
+; GFX10-NEXT: v_add3_u32 v1, v1, v6, v9
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_mul_lo_u32 v8, v5, v2
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v9
-; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
-; GFX10-NEXT: v_mul_lo_u32 v9, v4, v3
-; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11
-; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v8, v9
+; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v10, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v7, v11
+; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX10-NEXT: v_add3_u32 v1, v1, v9, v8
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
@@ -231,36 +221,34 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0
; GFX11-NEXT: v_mad_i64_i32 v[11:12], null, v5, v3, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v8, v1
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
-; GFX11-NEXT: v_mul_lo_u32 v8, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v9
-; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v10, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v12, vcc_lo
-; GFX11-NEXT: v_mul_lo_u32 v9, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v11
+; GFX11-NEXT: v_add3_u32 v1, v1, v6, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, v6
; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v10, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v7, v11
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v6, v2
-; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
+; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v8, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5
-; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_cndmask_b32 v4, v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index b7b90b72d19ae..9bac05eccabd5 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -437,18 +437,14 @@ define i64 @combine_mul_umul_lohi_i64(i64 %a, i64 %b) {
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: mulq %rsi
-; SSE-NEXT: imulq %rsi, %rdi
-; SSE-NEXT: xorq %rdx, %rdi
-; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: xorq %rdx, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_mul_umul_lohi_i64:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: imulq %rsi, %rdi
-; AVX-NEXT: xorq %rdx, %rdi
-; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: xorq %rdx, %rax
; AVX-NEXT: retq
%a128 = zext i64 %a to i128
%b128 = zext i64 %b to i128
@@ -465,18 +461,14 @@ define i64 @combine_mul_smul_lohi_commute_i64(i64 %a, i64 %b) {
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: imulq %rsi
-; SSE-NEXT: imulq %rdi, %rsi
-; SSE-NEXT: xorq %rdx, %rsi
-; SSE-NEXT: movq %rsi, %rax
+; SSE-NEXT: xorq %rdx, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_mul_smul_lohi_commute_i64:
; AVX: # %bb.0:
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: imulq %rsi
-; AVX-NEXT: imulq %rdi, %rsi
-; AVX-NEXT: xorq %rdx, %rsi
-; AVX-NEXT: movq %rsi, %rax
+; AVX-NEXT: xorq %rdx, %rax
; AVX-NEXT: retq
%a128 = sext i64 %a to i128
%b128 = sext i64 %b to i128
@@ -491,22 +483,18 @@ define i64 @combine_mul_smul_lohi_commute_i64(i64 %a, i64 %b) {
define i64 @combine_mul_umul_lohi_const_i64(i64 %h) {
; SSE-LABEL: combine_mul_umul_lohi_const_i64:
; SSE: # %bb.0:
-; SSE-NEXT: movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; SSE-NEXT: mulq %rcx
-; SSE-NEXT: imulq %rdi, %rcx
-; SSE-NEXT: xorq %rdx, %rcx
-; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: xorq %rdx, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_mul_umul_lohi_const_i64:
; AVX: # %bb.0:
-; AVX-NEXT: movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; AVX-NEXT: mulq %rcx
-; AVX-NEXT: imulq %rdi, %rcx
-; AVX-NEXT: xorq %rdx, %rcx
-; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: xorq %rdx, %rax
; AVX-NEXT: retq
%h128 = zext i64 %h to i128
%m128 = mul nuw i128 %h128, 14181476777654086739
@@ -520,30 +508,26 @@ define i64 @combine_mul_umul_lohi_const_i64(i64 %h) {
define i64 @combine_mul_smul_lohi_const_i64(i64 %h) {
; SSE-LABEL: combine_mul_smul_lohi_const_i64:
; SSE: # %bb.0:
-; SSE-NEXT: movq %rdi, %rsi
-; SSE-NEXT: sarq $63, %rsi
-; SSE-NEXT: movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; SSE-NEXT: movq %rdi, %rax
-; SSE-NEXT: mulq %rcx
-; SSE-NEXT: imulq %rcx, %rsi
-; SSE-NEXT: addq %rdx, %rsi
-; SSE-NEXT: imulq %rdi, %rcx
-; SSE-NEXT: xorq %rsi, %rcx
-; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: sarq $63, %rcx
+; SSE-NEXT: movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
+; SSE-NEXT: mulq %rsi
+; SSE-NEXT: imulq %rsi, %rcx
+; SSE-NEXT: addq %rdx, %rcx
+; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: combine_mul_smul_lohi_const_i64:
; AVX: # %bb.0:
-; AVX-NEXT: movq %rdi, %rsi
-; AVX-NEXT: sarq $63, %rsi
-; AVX-NEXT: movabsq $-4265267296055464877, %rcx # imm = 0xC4CEB9FE1A85EC53
; AVX-NEXT: movq %rdi, %rax
-; AVX-NEXT: mulq %rcx
-; AVX-NEXT: imulq %rcx, %rsi
-; AVX-NEXT: addq %rdx, %rsi
-; AVX-NEXT: imulq %rdi, %rcx
-; AVX-NEXT: xorq %rsi, %rcx
-; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: sarq $63, %rcx
+; AVX-NEXT: movabsq $-4265267296055464877, %rsi # imm = 0xC4CEB9FE1A85EC53
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: imulq %rsi, %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: xorq %rcx, %rax
; AVX-NEXT: retq
%h128 = sext i64 %h to i128
%m128 = mul nsw i128 %h128, 14181476777654086739
diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 60a2e21dcd03d..a184d49ce75a7 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -13,54 +13,51 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %r14, -16
-; CHECK-NEXT: movq %rdx, %r11
+; CHECK-NEXT: movq %rdx, %r10
; CHECK-NEXT: movq %rdi, %r9
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: sarq $63, %rbx
-; CHECK-NEXT: movq %rdx, %rdi
-; CHECK-NEXT: imulq %rbx, %rdi
-; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: mulq %rbx
-; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: imulq %rcx, %rbx
-; CHECK-NEXT: addq %rdi, %rbx
-; CHECK-NEXT: addq %rdx, %rbx
-; CHECK-NEXT: movq %rcx, %rdi
+; CHECK-NEXT: movq %rsi, %rdi
; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: movq %rcx, %r11
+; CHECK-NEXT: imulq %rdi, %r11
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: mulq %rdi
+; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: addq %rax, %r11
+; CHECK-NEXT: addq %rdx, %r11
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: sarq $63, %rax
+; CHECK-NEXT: movq %rax, %r14
; CHECK-NEXT: imulq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %r9
-; CHECK-NEXT: movq %rax, %r10
-; CHECK-NEXT: imulq %r9, %rdi
-; CHECK-NEXT: addq %r14, %rdi
-; CHECK-NEXT: addq %rdx, %rdi
-; CHECK-NEXT: addq %r8, %r10
-; CHECK-NEXT: adcq %rbx, %rdi
-; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: mulq %r11
-; CHECK-NEXT: movq %rdx, %rbx
; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: mulq %r11
+; CHECK-NEXT: addq %rax, %r14
+; CHECK-NEXT: addq %rdx, %r14
+; CHECK-NEXT: addq %rdi, %r8
+; CHECK-NEXT: adcq %r11, %r14
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: mulq %r10
; CHECK-NEXT: movq %rdx, %r11
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: addq %rbx, %r14
-; CHECK-NEXT: adcq $0, %r11
+; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: mulq %r10
+; CHECK-NEXT: movq %rdx, %r10
+; CHECK-NEXT: movq %rax, %rbx
+; CHECK-NEXT: addq %r11, %rbx
+; CHECK-NEXT: adcq $0, %r10
; CHECK-NEXT: movq %r9, %rax
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: movq %rdx, %r11
; CHECK-NEXT: movq %rax, %r9
-; CHECK-NEXT: addq %r14, %r9
-; CHECK-NEXT: adcq %r11, %rbx
+; CHECK-NEXT: addq %rbx, %r9
+; CHECK-NEXT: adcq %r10, %r11
; CHECK-NEXT: setb %al
-; CHECK-NEXT: movzbl %al, %r11d
+; CHECK-NEXT: movzbl %al, %r10d
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: addq %rbx, %rax
-; CHECK-NEXT: adcq %r11, %rdx
-; CHECK-NEXT: addq %r10, %rax
-; CHECK-NEXT: adcq %rdi, %rdx
+; CHECK-NEXT: addq %r11, %rax
+; CHECK-NEXT: adcq %r10, %rdx
+; CHECK-NEXT: addq %r8, %rax
+; CHECK-NEXT: adcq %r14, %rdx
; CHECK-NEXT: movq %r9, %rcx
; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: xorq %rcx, %rdx
@@ -68,7 +65,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
; CHECK-NEXT: orq %rdx, %rcx
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %nooverflow
-; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq %r9, %rdx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index 83b9460c7dae3..6d8b83824a6d5 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,7 +191,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $184, %esp
+; X86-NEXT: subl $192, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: negl %eax
@@ -199,172 +199,139 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: negl %eax
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: setb %bl
; X86-NEXT: addl %eax, %edi
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: setb %al
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: addl %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ebx, %eax
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: adcl %ebp, %ecx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
@@ -372,40 +339,71 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -413,411 +411,401 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %eax, %esi
; X86-NEXT: setb %al
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: addl %ebp, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %esi
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl (%esp), %edx # 4-byte Reload
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: setb %al
+; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %al, %ebp
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movzbl %cl, %ebx
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: setb %dl
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movzbl %dl, %esi
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movzbl %bl, %edi
+; X86-NEXT: adcl %edx, %edi
; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ecx, %esi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %edi, %edx
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: setb %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: setb %al
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: setb %cl
; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: movzbl %al, %ebp
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl %ebx, %ecx
; X86-NEXT: setb %bl
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: imull %eax, %edx
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: movzbl %bl, %edi
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl %ebp, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %eax, %eax
; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: adcl $0, %eax
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: setb %cl
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movzbl %cl, %edx
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: adcl %edx, %ecx
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: adcl $0, %eax
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: setb %al
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: setb %al
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %al, %ebp
+; X86-NEXT: adcl %ebx, %ebp
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl %edx, %ebx
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: imull %edx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %edx, %ecx
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: imull %edx, %ebp
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edx
-; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: xorl %ecx, %edi
-; X86-NEXT: orl %ebx, %edi
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: xorl %edx, %ebp
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: orl %ebp, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: andl $1, %edx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: negl %eax
-; X86-NEXT: xorl %eax, %ebp
+; X86-NEXT: xorl %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: orl %ebp, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: xorl %eax, %ebx
; X86-NEXT: orl %esi, %ebx
; X86-NEXT: xorl %edi, %eax
; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movb %dl, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, (%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movb %cl, 16(%eax)
; X86-NEXT: setne 20(%eax)
-; X86-NEXT: addl $184, %esp
+; X86-NEXT: addl $192, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -832,189 +820,170 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r8, %r12
-; X64-NEXT: movq %rcx, %r15
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rsi, %r8
+; X64-NEXT: movq %r8, %r13
+; X64-NEXT: movq %rcx, %r10
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rsi, %rbx
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: negq %rsi
-; X64-NEXT: andl $1, %r15d
-; X64-NEXT: negq %r15
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: negq %r11
+; X64-NEXT: andl $1, %r10d
+; X64-NEXT: negq %r10
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rax, %r12
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %rax, %rbx
+; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: adcq %rdx, %r8
+; X64-NEXT: setb %cl
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: addq %rax, %r8
; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movzbl %dil, %r13d
-; X64-NEXT: addq %rax, %rcx
-; X64-NEXT: adcq %rdx, %r13
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rdi, %r11
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %r14, %rbp
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: addq %r11, %rax
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: addq %rbp, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r10, %rdi
-; X64-NEXT: setb %r11b
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: adcq %r13, %rsi
+; X64-NEXT: setb %bpl
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movzbl %r11b, %edx
-; X64-NEXT: adcq %rdx, %r10
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: adcq %rbx, %r10
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: movzbl %bpl, %edx
+; X64-NEXT: adcq %rdx, %r14
+; X64-NEXT: addq %r12, %rax
+; X64-NEXT: movq %r12, %r9
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: adcq %rdi, %r14
+; X64-NEXT: adcq $0, %r8
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %r11, %rbp
-; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %r13, %r15
+; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %r8, %rbp
-; X64-NEXT: adcq %rax, %r11
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: addq %rbx, %r15
+; X64-NEXT: adcq %r13, %rbp
; X64-NEXT: setb %al
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: movzbl %al, %r14d
-; X64-NEXT: adcq %rdx, %r14
-; X64-NEXT: addq %r8, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r10, %rbp
-; X64-NEXT: adcq $0, %r11
+; X64-NEXT: addq %rdi, %rbp
+; X64-NEXT: movzbl %al, %r12d
+; X64-NEXT: adcq %rdx, %r12
+; X64-NEXT: addq %rbx, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r15
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: adcq $0, %r12
+; X64-NEXT: addq %r8, %rbp
+; X64-NEXT: adcq %rcx, %r12
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rdx, %r8
+; X64-NEXT: movq %rdx, %r14
; X64-NEXT: adcq $0, %r14
-; X64-NEXT: addq %rcx, %r11
-; X64-NEXT: adcq %r13, %r14
+; X64-NEXT: addq %rax, %r8
+; X64-NEXT: adcq %rdx, %r14
; X64-NEXT: setb %dil
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: addq %rax, %rbx
-; X64-NEXT: adcq %rdx, %r10
-; X64-NEXT: setb %cl
-; X64-NEXT: addq %rax, %r10
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: addq %rax, %r11
-; X64-NEXT: adcq %r14, %rbx
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %r14
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: addq %rax, %r14
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: movzbl %dil, %esi
+; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: addq %rax, %rbp
+; X64-NEXT: adcq %r12, %r8
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: addq %r9, %r14
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %r9, %rdi
; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: movq %rax, %rdx
; X64-NEXT: setb %al
-; X64-NEXT: addq %rdi, %rcx
-; X64-NEXT: movzbl %al, %edi
-; X64-NEXT: adcq %r8, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %r15, %rax
-; X64-NEXT: imulq %r15, %r12
-; X64-NEXT: addq %rax, %r12
-; X64-NEXT: addq %rdx, %r12
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: imulq %rsi
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r9, %r8
-; X64-NEXT: addq %rax, %r8
+; X64-NEXT: addq %rsi, %rcx
+; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: imulq %r11
+; X64-NEXT: movq %r9, %r11
+; X64-NEXT: addq %rax, %r11
+; X64-NEXT: movq %rdi, %r12
; X64-NEXT: adcq %rdx, %r12
-; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: adcq %rdi, %r12
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: movq %r13, %r15
+; X64-NEXT: addq %rcx, %r11
+; X64-NEXT: adcq %rsi, %r12
+; X64-NEXT: movq %rbx, %r10
+; X64-NEXT: addq %r13, %r10
+; X64-NEXT: adcq $0, %r13
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: addq %rdi, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: adcq %rdx, %rcx
+; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: adcq %rsi, %r13
; X64-NEXT: setb %r9b
-; X64-NEXT: addq %rdi, %rcx
-; X64-NEXT: movzbl %r9b, %edi
-; X64-NEXT: adcq %rdx, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: imulq %rsi, %rdx
-; X64-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: addq %rcx, %r13
+; X64-NEXT: movzbl %r9b, %ecx
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: adcq %r10, %rdx
; X64-NEXT: addq %r13, %rax
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq %r14, %r15
-; X64-NEXT: adcq %r8, %rax
-; X64-NEXT: adcq %r12, %rsi
-; X64-NEXT: addq %r11, %r13
-; X64-NEXT: adcq %rbx, %r15
-; X64-NEXT: adcq %r10, %rax
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: movq %rbp, %rcx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: adcq %r11, %rax
+; X64-NEXT: adcq %r12, %rdx
+; X64-NEXT: addq %rbp, %rbx
+; X64-NEXT: adcq %r8, %r10
+; X64-NEXT: adcq %r14, %rax
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: movq %r15, %rcx
; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: xorq %rcx, %rsi
-; X64-NEXT: xorq %rcx, %r15
-; X64-NEXT: orq %rsi, %r15
+; X64-NEXT: xorq %rcx, %rdx
+; X64-NEXT: xorq %rcx, %r10
+; X64-NEXT: orq %rdx, %r10
; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: orq %r15, %rax
-; X64-NEXT: xorq %r13, %rcx
+; X64-NEXT: orq %r10, %rax
+; X64-NEXT: xorq %rbx, %rcx
; X64-NEXT: orq %rax, %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movl %eax, %esi
; X64-NEXT: andl $1, %esi
; X64-NEXT: movq %rsi, %rdx
; X64-NEXT: negq %rdx
-; X64-NEXT: xorq %rdx, %rbp
+; X64-NEXT: xorq %rdx, %r15
; X64-NEXT: xorq %rax, %rdx
-; X64-NEXT: orq %rbp, %rdx
+; X64-NEXT: orq %r15, %rdx
; X64-NEXT: orq %rcx, %rdx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 0532916b1e4ca..fc5a54b3cf4ce 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -376,68 +376,65 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: imull %edx, %ecx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: imull %ebx, %edi
-; X86-NEXT: mull %ebx
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sarl $31, %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %ebp, %ebx
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: imull %ecx, %edx
+; X86-NEXT: imull %ebx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: imull %ebp, %edi
-; X86-NEXT: addl %edx, %edi
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: addl %eax, %ecx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: xorl %eax, %edi
-; X86-NEXT: xorl %ebp, %ecx
-; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: notl %ecx
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: cmovel %ebx, %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %ebp, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: notl %ebx
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: cmovel %ecx, %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: addl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index fc40c539f37c7..7113aaf7e83ed 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -14,61 +14,58 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
-; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rdi, %r10
-; X64-NEXT: movq %rsi, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: imulq %r14, %rdi
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: imulq %rcx, %r14
-; X64-NEXT: addq %rdi, %r14
-; X64-NEXT: addq %rdx, %r14
-; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: movq %rsi, %rdi
; X64-NEXT: sarq $63, %rdi
-; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: imulq %rdi, %rbx
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %rax, %rbx
+; X64-NEXT: addq %rdx, %rbx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: movq %rax, %r15
; X64-NEXT: imulq %rsi, %r15
-; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: imulq %r10, %rdi
-; X64-NEXT: addq %r15, %rdi
-; X64-NEXT: addq %rdx, %rdi
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: adcq %r14, %rdi
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rbx
+; X64-NEXT: addq %rax, %r15
+; X64-NEXT: addq %rdx, %r15
+; X64-NEXT: addq %rdi, %r9
+; X64-NEXT: adcq %rbx, %r15
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r14, %r15
-; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rbx, %r14
+; X64-NEXT: adcq $0, %r11
; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r15, %r10
-; X64-NEXT: adcq %rbx, %r14
+; X64-NEXT: addq %r14, %r10
+; X64-NEXT: adcq %r11, %rbx
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movzbl %al, %r11d
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: adcq %r11, %rdx
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: adcq %r15, %rdx
; X64-NEXT: movq %r10, 8(%r8)
; X64-NEXT: sarq $63, %r10
; X64-NEXT: xorq %r10, %rdx
; X64-NEXT: xorq %rax, %r10
; X64-NEXT: orq %rdx, %r10
; X64-NEXT: setne %al
-; X64-NEXT: movq %r9, (%r8)
+; X64-NEXT: movq %rdi, (%r8)
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
@@ -84,8 +81,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $52, %esp
-; X86-NEXT: .cfi_def_cfa_offset 72
+; X86-NEXT: subl $60, %esp
+; X86-NEXT: .cfi_def_cfa_offset 80
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
@@ -102,19 +99,20 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: setb %cl
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -126,197 +124,181 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl %ebx, %ecx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: setb %bl
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %esi, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: imull %esi, %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %edi, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: imull %eax, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sarl $31, %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: imull %esi, %ebx
-; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: imull %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: addl %edx, %ecx
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %edi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: xorl %ecx, %edi
-; X86-NEXT: xorl %esi, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: xorl %eax, %ebp
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: orl %ebp, %eax
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -324,7 +306,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $52, %esp
+; X86-NEXT: addl $60, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -358,232 +340,214 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X64-NEXT: .cfi_offset %r14, -32
; X64-NEXT: .cfi_offset %r15, -24
; X64-NEXT: .cfi_offset %rbp, -16
-; X64-NEXT: movq %r8, %rbx
-; X64-NEXT: movq %rcx, %r11
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rsi, %r11
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rax, %r10
; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rsi, %rbx
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r8, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rbx, %r14
; X64-NEXT: adcq %rcx, %r12
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %ecx
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r12, %r14
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r12, %rsi
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r8, %r13
+; X64-NEXT: addq %rbx, %r13
; X64-NEXT: adcq $0, %r12
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %r9, %rsi
; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r12, %r11
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r15, %r9
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r11, %r8
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
-; X64-NEXT: adcq %r10, %rbp
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
-; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: adcq %r12, %rbx
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: addq %rbx, %rbp
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %r13
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: addq %r10, %rbp
+; X64-NEXT: adcq %r14, %r13
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, %rsi
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %r11, %rbx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %r11, %r9
-; X64-NEXT: adcq $0, %r13
+; X64-NEXT: addq %r8, %r9
+; X64-NEXT: adcq $0, %r10
; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: addq %r9, %rax
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: adcq %r13, %r10
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: adcq %r10, %r11
; X64-NEXT: setb %cl
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r10, %r13
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %r11, %r8
; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r11
-; X64-NEXT: addq %r8, %rdi
+; X64-NEXT: adcq %rax, %r10
+; X64-NEXT: addq %rbp, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r13, %rdi
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %rbp, %r9
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: addq %r14, %r13
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Folded Reload
-; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
+; X64-NEXT: setb %cl
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Reload
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rsi, %r9
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: adcq %rdi, %r9
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: adcq %rdi, %r11
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r9, %r14
-; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r11, %r13
+; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: addq %r13, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r11, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: addq %r8, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r10, %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r13
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r10, %r13
-; X64-NEXT: sarq $63, %r13
-; X64-NEXT: movq %r13, %rcx
-; X64-NEXT: imulq %r12, %rcx
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: imulq %r13, %r15
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: addq %rdx, %r15
-; X64-NEXT: movq %r13, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; X64-NEXT: imulq %rsi, %rcx
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: imulq %r13, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %r15, %r8
+; X64-NEXT: sarq $63, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: addq %rax, %r8
-; X64-NEXT: adcq %r15, %rbx
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r9, %r15
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: addq %r11, %r15
-; X64-NEXT: adcq %r9, %r13
-; X64-NEXT: setb %cl
-; X64-NEXT: addq %rax, %r13
-; X64-NEXT: movzbl %cl, %r9d
-; X64-NEXT: adcq %rdx, %r9
-; X64-NEXT: addq %r8, %r13
-; X64-NEXT: adcq %rbx, %r9
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %r9, %r10
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: addq %rsi, %r10
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r9, %r14
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r8, %r9
+; X64-NEXT: imulq %r12, %r9
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq {{[0-9]+}}(%rsp)
+; X64-NEXT: addq %rax, %r9
+; X64-NEXT: addq %rdx, %r9
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq %r10, %r9
+; X64-NEXT: addq %r11, %r14
+; X64-NEXT: movzbl %sil, %edi
+; X64-NEXT: adcq %rcx, %rdi
+; X64-NEXT: addq %rax, %r14
+; X64-NEXT: adcq %r9, %rdi
; X64-NEXT: sarq $63, %r12
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: imulq %r12, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: imulq %r12, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
-; X64-NEXT: movq %r8, %rdx
-; X64-NEXT: imulq %r12, %rdx
-; X64-NEXT: imulq %r12, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: movq %r10, %rcx
; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %rax, %rcx
; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: addq %r10, %rbx
-; X64-NEXT: addq %rsi, %r8
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: addq %r10, %rbx
-; X64-NEXT: adcq $0, %r10
+; X64-NEXT: adcq $0, %r11
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: addq %rax, %rbx
-; X64-NEXT: adcq %rdx, %r10
-; X64-NEXT: setb %r12b
-; X64-NEXT: addq %rax, %r10
-; X64-NEXT: movzbl %r12b, %eax
-; X64-NEXT: adcq %rdx, %rax
-; X64-NEXT: addq %r8, %r10
-; X64-NEXT: adcq %rcx, %rax
-; X64-NEXT: addq %r11, %rsi
-; X64-NEXT: adcq %r15, %rbx
-; X64-NEXT: adcq %r13, %r10
-; X64-NEXT: adcq %r9, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: setb %bl
+; X64-NEXT: imulq %r12, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT: mulq %r12
+; X64-NEXT: addq %rax, %r15
+; X64-NEXT: addq %rdx, %r15
+; X64-NEXT: addq %rsi, %rax
+; X64-NEXT: adcq %rcx, %r15
+; X64-NEXT: addq %r9, %r11
+; X64-NEXT: movzbl %bl, %edx
+; X64-NEXT: adcq %r8, %rdx
+; X64-NEXT: addq %rax, %r11
+; X64-NEXT: adcq %r15, %rdx
; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
-; X64-NEXT: adcq %r14, %r10
-; X64-NEXT: adcq %rbp, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: sarq $63, %rdx
-; X64-NEXT: xorq %rdx, %rax
-; X64-NEXT: xorq %rdx, %rbx
-; X64-NEXT: orq %rax, %rbx
-; X64-NEXT: xorq %rdx, %r10
-; X64-NEXT: xorq %rsi, %rdx
-; X64-NEXT: orq %r10, %rdx
-; X64-NEXT: orq %rbx, %rdx
+; X64-NEXT: adcq %r10, %rcx
+; X64-NEXT: adcq %r14, %r11
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT: adcq %r13, %r11
+; X64-NEXT: adcq %rbp, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorq %rax, %rdx
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: orq %rdx, %rcx
+; X64-NEXT: xorq %rax, %r11
+; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: orq %r11, %rax
+; X64-NEXT: orq %rcx, %rax
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq %rcx, 24(%rax)
+; X64-NEXT: movq %rdi, 24(%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
; X64-NEXT: movq %rcx, (%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -609,8 +573,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $152, %esp
-; X86-NEXT: .cfi_def_cfa_offset 172
+; X86-NEXT: subl $156, %esp
+; X86-NEXT: .cfi_def_cfa_offset 176
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
@@ -644,66 +608,65 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %esi
@@ -772,43 +735,43 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb %cl
+; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %esi, %ebx
@@ -816,25 +779,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -857,83 +820,80 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: adcl $0, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %esi, %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %edi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %ebp, %esi
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
@@ -942,67 +902,68 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: adcl %ebp, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: adcl %eax, %ebp
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %bl
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: adcl $0, %edx
@@ -1015,9 +976,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: adcl $0, %eax
; X86-NEXT: adcl $0, %esi
@@ -1030,131 +991,132 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: adcl %edi, %ecx
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %eax
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
@@ -1165,369 +1127,335 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: setb %al
+; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: setb %dl
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %dl, %edx
-; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %ebp, %ebx
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
; X86-NEXT: adcl $0, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: addl %ebp, %edi
; X86-NEXT: adcl %esi, %edx
; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: imull %edi, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: imull %ecx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: imull %edi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT: mull %ecx
; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: imull %edi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: imull %edi, %edx
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: adcl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl (%esp), %ebx ## 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movzbl %bl, %edi
; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: setb %bl
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movzbl %bl, %ebx
+; X86-NEXT: adcl %edx, %ebx
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: setb %cl
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: setb %al
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl %ebp, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: setb %al
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %esi, %eax
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: imull %esi, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: setb %dl
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb %al
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ebx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl %dl, %ecx
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: imull %ebx, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: imull %ebx, %esi
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: imull %ebx, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl %ebx, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, %esi
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %ecx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl (%esp), %ebx ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: xorl %esi, %edi
-; X86-NEXT: xorl %esi, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: xorl %esi, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: xorl %esi, %edx
-; X86-NEXT: orl %edi, %edx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: xorl %esi, %ebx
-; X86-NEXT: xorl %esi, %ecx
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: xorl %eax, %ebp
+; X86-NEXT: orl %edx, %ebp
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: orl %ebp, %eax
; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 28(%eax)
+; X86-NEXT: movl %ebx, 28(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1543,7 +1471,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 24(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $152, %esp
+; X86-NEXT: addl $156, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 7bb7c9b481d39..fef44e062fd3b 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3297,112 +3297,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %r8, %r14
+; SSE2-NEXT: movq %rcx, %r13
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rsi, %r11
; SSE2-NEXT: movq %rdi, %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT: movq %r11, %r12
-; SSE2-NEXT: sarq $63, %r12
-; SSE2-NEXT: movq %r14, %rbx
-; SSE2-NEXT: imulq %r12, %rbx
+; SSE2-NEXT: movq %r11, %rcx
+; SSE2-NEXT: sarq $63, %rcx
+; SSE2-NEXT: movq %r9, %r15
+; SSE2-NEXT: imulq %rcx, %r15
; SSE2-NEXT: movq %r14, %rax
-; SSE2-NEXT: mulq %r12
+; SSE2-NEXT: mulq %rcx
; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: imulq %r9, %r12
-; SSE2-NEXT: addq %rbx, %r12
-; SSE2-NEXT: addq %rdx, %r12
-; SSE2-NEXT: movq %r9, %rbx
-; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: movq %rbx, %r13
-; SSE2-NEXT: imulq %r11, %r13
-; SSE2-NEXT: movq %rbx, %rax
+; SSE2-NEXT: addq %rax, %r15
+; SSE2-NEXT: addq %rdx, %r15
+; SSE2-NEXT: movq %r9, %rax
+; SSE2-NEXT: sarq $63, %rax
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: imulq %r11, %rcx
; SSE2-NEXT: mulq %r10
-; SSE2-NEXT: movq %rax, %r15
-; SSE2-NEXT: imulq %r10, %rbx
-; SSE2-NEXT: addq %r13, %rbx
-; SSE2-NEXT: addq %rdx, %rbx
-; SSE2-NEXT: addq %rdi, %r15
-; SSE2-NEXT: adcq %r12, %rbx
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: addq %rax, %rcx
+; SSE2-NEXT: addq %rdx, %rcx
+; SSE2-NEXT: addq %rdi, %rbx
+; SSE2-NEXT: adcq %r15, %rcx
; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r14
-; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: movq %rdx, %r15
; SSE2-NEXT: movq %rax, %rdi
; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r14
; SSE2-NEXT: movq %rdx, %r14
-; SSE2-NEXT: movq %rax, %r13
-; SSE2-NEXT: addq %r12, %r13
+; SSE2-NEXT: movq %rax, %r12
+; SSE2-NEXT: addq %r15, %r12
; SSE2-NEXT: adcq $0, %r14
; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: movq %rdx, %r15
; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: addq %r13, %r10
-; SSE2-NEXT: adcq %r14, %r12
+; SSE2-NEXT: addq %r12, %r10
+; SSE2-NEXT: adcq %r14, %r15
; SSE2-NEXT: setb %al
; SSE2-NEXT: movzbl %al, %r14d
; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: addq %r12, %rax
-; SSE2-NEXT: adcq %r14, %rdx
; SSE2-NEXT: addq %r15, %rax
-; SSE2-NEXT: adcq %rbx, %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT: movq %r10, 8(%r12)
+; SSE2-NEXT: adcq %r14, %rdx
+; SSE2-NEXT: addq %rbx, %rax
+; SSE2-NEXT: adcq %rcx, %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT: movq %r10, 8(%r15)
; SSE2-NEXT: sarq $63, %r10
; SSE2-NEXT: xorq %r10, %rdx
; SSE2-NEXT: xorq %rax, %r10
-; SSE2-NEXT: xorl %r15d, %r15d
+; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: orq %rdx, %r10
-; SSE2-NEXT: setne %r15b
-; SSE2-NEXT: movq %rcx, %rbx
-; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: movq %rsi, %r10
-; SSE2-NEXT: imulq %rbx, %r10
+; SSE2-NEXT: setne %cl
+; SSE2-NEXT: movq %r13, %r9
+; SSE2-NEXT: sarq $63, %r9
+; SSE2-NEXT: movq %rbp, %r11
+; SSE2-NEXT: imulq %r9, %r11
; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: mulq %rbx
+; SSE2-NEXT: mulq %r9
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: imulq %rbp, %rbx
-; SSE2-NEXT: addq %r10, %rbx
-; SSE2-NEXT: addq %rdx, %rbx
-; SSE2-NEXT: movq %rbp, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: movq %r10, %r14
-; SSE2-NEXT: imulq %rcx, %r14
-; SSE2-NEXT: movq %r10, %rax
+; SSE2-NEXT: addq %rax, %r11
+; SSE2-NEXT: addq %rdx, %r11
+; SSE2-NEXT: movq %rbp, %rax
+; SSE2-NEXT: sarq $63, %rax
+; SSE2-NEXT: movq %rax, %r14
+; SSE2-NEXT: imulq %r13, %r14
; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: movq %rax, %r11
-; SSE2-NEXT: imulq %r8, %r10
-; SSE2-NEXT: addq %r14, %r10
-; SSE2-NEXT: addq %rdx, %r10
-; SSE2-NEXT: addq %r9, %r11
-; SSE2-NEXT: adcq %rbx, %r10
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: addq %rax, %r14
+; SSE2-NEXT: addq %rdx, %r14
+; SSE2-NEXT: addq %r9, %r10
+; SSE2-NEXT: adcq %r11, %r14
; SSE2-NEXT: movq %r8, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %r9
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %rax, %r11
+; SSE2-NEXT: movq %r13, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: movq %rax, %r14
-; SSE2-NEXT: addq %r9, %r14
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: addq %r9, %rbx
; SSE2-NEXT: adcq $0, %rsi
; SSE2-NEXT: movq %r8, %rax
; SSE2-NEXT: mulq %rbp
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: addq %r14, %r9
+; SSE2-NEXT: addq %rbx, %r9
; SSE2-NEXT: adcq %rsi, %r8
; SSE2-NEXT: setb %al
; SSE2-NEXT: movzbl %al, %esi
-; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %r13, %rax
; SSE2-NEXT: mulq %rbp
; SSE2-NEXT: addq %r8, %rax
; SSE2-NEXT: adcq %rsi, %rdx
-; SSE2-NEXT: addq %r11, %rax
-; SSE2-NEXT: adcq %r10, %rdx
-; SSE2-NEXT: movq %r9, 24(%r12)
+; SSE2-NEXT: addq %r10, %rax
+; SSE2-NEXT: adcq %r14, %rdx
+; SSE2-NEXT: movq %r9, 24(%r15)
; SSE2-NEXT: sarq $63, %r9
; SSE2-NEXT: xorq %r9, %rdx
; SSE2-NEXT: xorq %rax, %r9
@@ -3411,11 +3406,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: setne %al
; SSE2-NEXT: negl %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: negl %r15d
-; SSE2-NEXT: movd %r15d, %xmm0
+; SSE2-NEXT: negl %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rbx, 16(%r12)
-; SSE2-NEXT: movq %rdi, (%r12)
+; SSE2-NEXT: movq %r11, 16(%r15)
+; SSE2-NEXT: movq %rdi, (%r15)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -3433,112 +3428,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: movq %r8, %r14
+; SSSE3-NEXT: movq %rcx, %r13
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rsi, %r11
; SSSE3-NEXT: movq %rdi, %r10
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSSE3-NEXT: movq %r11, %r12
-; SSSE3-NEXT: sarq $63, %r12
-; SSSE3-NEXT: movq %r14, %rbx
-; SSSE3-NEXT: imulq %r12, %rbx
+; SSSE3-NEXT: movq %r11, %rcx
+; SSSE3-NEXT: sarq $63, %rcx
+; SSSE3-NEXT: movq %r9, %r15
+; SSSE3-NEXT: imulq %rcx, %r15
; SSSE3-NEXT: movq %r14, %rax
-; SSSE3-NEXT: mulq %r12
+; SSSE3-NEXT: mulq %rcx
; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: imulq %r9, %r12
-; SSSE3-NEXT: addq %rbx, %r12
-; SSSE3-NEXT: addq %rdx, %r12
-; SSSE3-NEXT: movq %r9, %rbx
-; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: movq %rbx, %r13
-; SSSE3-NEXT: imulq %r11, %r13
-; SSSE3-NEXT: movq %rbx, %rax
+; SSSE3-NEXT: addq %rax, %r15
+; SSSE3-NEXT: addq %rdx, %r15
+; SSSE3-NEXT: movq %r9, %rax
+; SSSE3-NEXT: sarq $63, %rax
+; SSSE3-NEXT: movq %rax, %rcx
+; SSSE3-NEXT: imulq %r11, %rcx
; SSSE3-NEXT: mulq %r10
-; SSSE3-NEXT: movq %rax, %r15
-; SSSE3-NEXT: imulq %r10, %rbx
-; SSSE3-NEXT: addq %r13, %rbx
-; SSSE3-NEXT: addq %rdx, %rbx
-; SSSE3-NEXT: addq %rdi, %r15
-; SSSE3-NEXT: adcq %r12, %rbx
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: addq %rax, %rcx
+; SSSE3-NEXT: addq %rdx, %rcx
+; SSSE3-NEXT: addq %rdi, %rbx
+; SSSE3-NEXT: adcq %r15, %rcx
; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r14
-; SSSE3-NEXT: movq %rdx, %r12
+; SSSE3-NEXT: movq %rdx, %r15
; SSSE3-NEXT: movq %rax, %rdi
; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r14
; SSSE3-NEXT: movq %rdx, %r14
-; SSSE3-NEXT: movq %rax, %r13
-; SSSE3-NEXT: addq %r12, %r13
+; SSSE3-NEXT: movq %rax, %r12
+; SSSE3-NEXT: addq %r15, %r12
; SSSE3-NEXT: adcq $0, %r14
; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: movq %rdx, %r12
+; SSSE3-NEXT: movq %rdx, %r15
; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: addq %r13, %r10
-; SSSE3-NEXT: adcq %r14, %r12
+; SSSE3-NEXT: addq %r12, %r10
+; SSSE3-NEXT: adcq %r14, %r15
; SSSE3-NEXT: setb %al
; SSSE3-NEXT: movzbl %al, %r14d
; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: addq %r12, %rax
-; SSSE3-NEXT: adcq %r14, %rdx
; SSSE3-NEXT: addq %r15, %rax
-; SSSE3-NEXT: adcq %rbx, %rdx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSSE3-NEXT: movq %r10, 8(%r12)
+; SSSE3-NEXT: adcq %r14, %rdx
+; SSSE3-NEXT: addq %rbx, %rax
+; SSSE3-NEXT: adcq %rcx, %rdx
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSSE3-NEXT: movq %r10, 8(%r15)
; SSSE3-NEXT: sarq $63, %r10
; SSSE3-NEXT: xorq %r10, %rdx
; SSSE3-NEXT: xorq %rax, %r10
-; SSSE3-NEXT: xorl %r15d, %r15d
+; SSSE3-NEXT: xorl %ecx, %ecx
; SSSE3-NEXT: orq %rdx, %r10
-; SSSE3-NEXT: setne %r15b
-; SSSE3-NEXT: movq %rcx, %rbx
-; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: movq %rsi, %r10
-; SSSE3-NEXT: imulq %rbx, %r10
+; SSSE3-NEXT: setne %cl
+; SSSE3-NEXT: movq %r13, %r9
+; SSSE3-NEXT: sarq $63, %r9
+; SSSE3-NEXT: movq %rbp, %r11
+; SSSE3-NEXT: imulq %r9, %r11
; SSSE3-NEXT: movq %rsi, %rax
-; SSSE3-NEXT: mulq %rbx
+; SSSE3-NEXT: mulq %r9
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: imulq %rbp, %rbx
-; SSSE3-NEXT: addq %r10, %rbx
-; SSSE3-NEXT: addq %rdx, %rbx
-; SSSE3-NEXT: movq %rbp, %r10
-; SSSE3-NEXT: sarq $63, %r10
-; SSSE3-NEXT: movq %r10, %r14
-; SSSE3-NEXT: imulq %rcx, %r14
-; SSSE3-NEXT: movq %r10, %rax
+; SSSE3-NEXT: addq %rax, %r11
+; SSSE3-NEXT: addq %rdx, %r11
+; SSSE3-NEXT: movq %rbp, %rax
+; SSSE3-NEXT: sarq $63, %rax
+; SSSE3-NEXT: movq %rax, %r14
+; SSSE3-NEXT: imulq %r13, %r14
; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: movq %rax, %r11
-; SSSE3-NEXT: imulq %r8, %r10
-; SSSE3-NEXT: addq %r14, %r10
-; SSSE3-NEXT: addq %rdx, %r10
-; SSSE3-NEXT: addq %r9, %r11
-; SSSE3-NEXT: adcq %rbx, %r10
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: addq %rax, %r14
+; SSSE3-NEXT: addq %rdx, %r14
+; SSSE3-NEXT: addq %r9, %r10
+; SSSE3-NEXT: adcq %r11, %r14
; SSSE3-NEXT: movq %r8, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %r9
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: movq %rax, %r11
+; SSSE3-NEXT: movq %r13, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: movq %rax, %r14
-; SSSE3-NEXT: addq %r9, %r14
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: addq %r9, %rbx
; SSSE3-NEXT: adcq $0, %rsi
; SSSE3-NEXT: movq %r8, %rax
; SSSE3-NEXT: mulq %rbp
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: addq %r14, %r9
+; SSSE3-NEXT: addq %rbx, %r9
; SSSE3-NEXT: adcq %rsi, %r8
; SSSE3-NEXT: setb %al
; SSSE3-NEXT: movzbl %al, %esi
-; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: movq %r13, %rax
; SSSE3-NEXT: mulq %rbp
; SSSE3-NEXT: addq %r8, %rax
; SSSE3-NEXT: adcq %rsi, %rdx
-; SSSE3-NEXT: addq %r11, %rax
-; SSSE3-NEXT: adcq %r10, %rdx
-; SSSE3-NEXT: movq %r9, 24(%r12)
+; SSSE3-NEXT: addq %r10, %rax
+; SSSE3-NEXT: adcq %r14, %rdx
+; SSSE3-NEXT: movq %r9, 24(%r15)
; SSSE3-NEXT: sarq $63, %r9
; SSSE3-NEXT: xorq %r9, %rdx
; SSSE3-NEXT: xorq %rax, %r9
@@ -3547,11 +3537,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: setne %al
; SSSE3-NEXT: negl %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: negl %r15d
-; SSSE3-NEXT: movd %r15d, %xmm0
+; SSSE3-NEXT: negl %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rbx, 16(%r12)
-; SSSE3-NEXT: movq %rdi, (%r12)
+; SSSE3-NEXT: movq %r11, 16(%r15)
+; SSSE3-NEXT: movq %rdi, (%r15)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
@@ -3569,112 +3559,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: movq %r8, %r14
+; SSE41-NEXT: movq %rcx, %r13
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rsi, %r11
; SSE41-NEXT: movq %rdi, %r10
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movq %r11, %r12
-; SSE41-NEXT: sarq $63, %r12
-; SSE41-NEXT: movq %r14, %rbx
-; SSE41-NEXT: imulq %r12, %rbx
+; SSE41-NEXT: movq %r11, %rcx
+; SSE41-NEXT: sarq $63, %rcx
+; SSE41-NEXT: movq %r9, %r15
+; SSE41-NEXT: imulq %rcx, %r15
; SSE41-NEXT: movq %r14, %rax
-; SSE41-NEXT: mulq %r12
+; SSE41-NEXT: mulq %rcx
; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: imulq %r9, %r12
-; SSE41-NEXT: addq %rbx, %r12
-; SSE41-NEXT: addq %rdx, %r12
-; SSE41-NEXT: movq %r9, %rbx
-; SSE41-NEXT: sarq $63, %rbx
-; SSE41-NEXT: movq %rbx, %r13
-; SSE41-NEXT: imulq %r11, %r13
-; SSE41-NEXT: movq %rbx, %rax
+; SSE41-NEXT: addq %rax, %r15
+; SSE41-NEXT: addq %rdx, %r15
+; SSE41-NEXT: movq %r9, %rax
+; SSE41-NEXT: sarq $63, %rax
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: imulq %r11, %rcx
; SSE41-NEXT: mulq %r10
-; SSE41-NEXT: movq %rax, %r15
-; SSE41-NEXT: imulq %r10, %rbx
-; SSE41-NEXT: addq %r13, %rbx
-; SSE41-NEXT: addq %rdx, %rbx
-; SSE41-NEXT: addq %rdi, %r15
-; SSE41-NEXT: adcq %r12, %rbx
+; SSE41-NEXT: movq %rax, %rbx
+; SSE41-NEXT: addq %rax, %rcx
+; SSE41-NEXT: addq %rdx, %rcx
+; SSE41-NEXT: addq %rdi, %rbx
+; SSE41-NEXT: adcq %r15, %rcx
; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r14
-; SSE41-NEXT: movq %rdx, %r12
+; SSE41-NEXT: movq %rdx, %r15
; SSE41-NEXT: movq %rax, %rdi
; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r14
; SSE41-NEXT: movq %rdx, %r14
-; SSE41-NEXT: movq %rax, %r13
-; SSE41-NEXT: addq %r12, %r13
+; SSE41-NEXT: movq %rax, %r12
+; SSE41-NEXT: addq %r15, %r12
; SSE41-NEXT: adcq $0, %r14
; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: movq %rdx, %r12
+; SSE41-NEXT: movq %rdx, %r15
; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: addq %r13, %r10
-; SSE41-NEXT: adcq %r14, %r12
+; SSE41-NEXT: addq %r12, %r10
+; SSE41-NEXT: adcq %r14, %r15
; SSE41-NEXT: setb %al
; SSE41-NEXT: movzbl %al, %r14d
; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: addq %r12, %rax
-; SSE41-NEXT: adcq %r14, %rdx
; SSE41-NEXT: addq %r15, %rax
-; SSE41-NEXT: adcq %rbx, %rdx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE41-NEXT: movq %r10, 8(%r12)
+; SSE41-NEXT: adcq %r14, %rdx
+; SSE41-NEXT: addq %rbx, %rax
+; SSE41-NEXT: adcq %rcx, %rdx
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT: movq %r10, 8(%r15)
; SSE41-NEXT: sarq $63, %r10
; SSE41-NEXT: xorq %r10, %rdx
; SSE41-NEXT: xorq %rax, %r10
-; SSE41-NEXT: xorl %r15d, %r15d
+; SSE41-NEXT: xorl %ecx, %ecx
; SSE41-NEXT: orq %rdx, %r10
-; SSE41-NEXT: setne %r15b
-; SSE41-NEXT: movq %rcx, %rbx
-; SSE41-NEXT: sarq $63, %rbx
-; SSE41-NEXT: movq %rsi, %r10
-; SSE41-NEXT: imulq %rbx, %r10
+; SSE41-NEXT: setne %cl
+; SSE41-NEXT: movq %r13, %r9
+; SSE41-NEXT: sarq $63, %r9
+; SSE41-NEXT: movq %rbp, %r11
+; SSE41-NEXT: imulq %r9, %r11
; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: mulq %rbx
+; SSE41-NEXT: mulq %r9
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: imulq %rbp, %rbx
-; SSE41-NEXT: addq %r10, %rbx
-; SSE41-NEXT: addq %rdx, %rbx
-; SSE41-NEXT: movq %rbp, %r10
-; SSE41-NEXT: sarq $63, %r10
-; SSE41-NEXT: movq %r10, %r14
-; SSE41-NEXT: imulq %rcx, %r14
-; SSE41-NEXT: movq %r10, %rax
+; SSE41-NEXT: addq %rax, %r11
+; SSE41-NEXT: addq %rdx, %r11
+; SSE41-NEXT: movq %rbp, %rax
+; SSE41-NEXT: sarq $63, %rax
+; SSE41-NEXT: movq %rax, %r14
+; SSE41-NEXT: imulq %r13, %r14
; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: movq %rax, %r11
-; SSE41-NEXT: imulq %r8, %r10
-; SSE41-NEXT: addq %r14, %r10
-; SSE41-NEXT: addq %rdx, %r10
-; SSE41-NEXT: addq %r9, %r11
-; SSE41-NEXT: adcq %rbx, %r10
+; SSE41-NEXT: movq %rax, %r10
+; SSE41-NEXT: addq %rax, %r14
+; SSE41-NEXT: addq %rdx, %r14
+; SSE41-NEXT: addq %r9, %r10
+; SSE41-NEXT: adcq %r11, %r14
; SSE41-NEXT: movq %r8, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %r9
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: movq %rax, %r11
+; SSE41-NEXT: movq %r13, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: movq %rax, %r14
-; SSE41-NEXT: addq %r9, %r14
+; SSE41-NEXT: movq %rax, %rbx
+; SSE41-NEXT: addq %r9, %rbx
; SSE41-NEXT: adcq $0, %rsi
; SSE41-NEXT: movq %r8, %rax
; SSE41-NEXT: mulq %rbp
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: addq %r14, %r9
+; SSE41-NEXT: addq %rbx, %r9
; SSE41-NEXT: adcq %rsi, %r8
; SSE41-NEXT: setb %al
; SSE41-NEXT: movzbl %al, %esi
-; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: movq %r13, %rax
; SSE41-NEXT: mulq %rbp
; SSE41-NEXT: addq %r8, %rax
; SSE41-NEXT: adcq %rsi, %rdx
-; SSE41-NEXT: addq %r11, %rax
-; SSE41-NEXT: adcq %r10, %rdx
-; SSE41-NEXT: movq %r9, 24(%r12)
+; SSE41-NEXT: addq %r10, %rax
+; SSE41-NEXT: adcq %r14, %rdx
+; SSE41-NEXT: movq %r9, 24(%r15)
; SSE41-NEXT: sarq $63, %r9
; SSE41-NEXT: xorq %r9, %rdx
; SSE41-NEXT: xorq %rax, %r9
@@ -3682,11 +3667,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: orq %rdx, %r9
; SSE41-NEXT: setne %al
; SSE41-NEXT: negl %eax
-; SSE41-NEXT: negl %r15d
-; SSE41-NEXT: movd %r15d, %xmm0
+; SSE41-NEXT: negl %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %rbx, 16(%r12)
-; SSE41-NEXT: movq %rdi, (%r12)
+; SSE41-NEXT: movq %r11, 16(%r15)
+; SSE41-NEXT: movq %rdi, (%r15)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
@@ -3704,112 +3689,107 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %r8, %r14
+; AVX-NEXT: movq %rcx, %r13
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rsi, %r11
; AVX-NEXT: movq %rdi, %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT: movq %r11, %r12
-; AVX-NEXT: sarq $63, %r12
-; AVX-NEXT: movq %r14, %rbx
-; AVX-NEXT: imulq %r12, %rbx
+; AVX-NEXT: movq %r11, %rcx
+; AVX-NEXT: sarq $63, %rcx
+; AVX-NEXT: movq %r9, %r15
+; AVX-NEXT: imulq %rcx, %r15
; AVX-NEXT: movq %r14, %rax
-; AVX-NEXT: mulq %r12
+; AVX-NEXT: mulq %rcx
; AVX-NEXT: movq %rax, %rdi
-; AVX-NEXT: imulq %r9, %r12
-; AVX-NEXT: addq %rbx, %r12
-; AVX-NEXT: addq %rdx, %r12
-; AVX-NEXT: movq %r9, %rbx
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: movq %rbx, %r13
-; AVX-NEXT: imulq %r11, %r13
-; AVX-NEXT: movq %rbx, %rax
+; AVX-NEXT: addq %rax, %r15
+; AVX-NEXT: addq %rdx, %r15
+; AVX-NEXT: movq %r9, %rax
+; AVX-NEXT: sarq $63, %rax
+; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: imulq %r11, %rcx
; AVX-NEXT: mulq %r10
-; AVX-NEXT: movq %rax, %r15
-; AVX-NEXT: imulq %r10, %rbx
-; AVX-NEXT: addq %r13, %rbx
-; AVX-NEXT: addq %rdx, %rbx
-; AVX-NEXT: addq %rdi, %r15
-; AVX-NEXT: adcq %r12, %rbx
+; AVX-NEXT: movq %rax, %rbx
+; AVX-NEXT: addq %rax, %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: addq %rdi, %rbx
+; AVX-NEXT: adcq %r15, %rcx
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r14
-; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: movq %rdx, %r15
; AVX-NEXT: movq %rax, %rdi
; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r14
; AVX-NEXT: movq %rdx, %r14
-; AVX-NEXT: movq %rax, %r13
-; AVX-NEXT: addq %r12, %r13
+; AVX-NEXT: movq %rax, %r12
+; AVX-NEXT: addq %r15, %r12
; AVX-NEXT: adcq $0, %r14
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r9
-; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: movq %rdx, %r15
; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: addq %r13, %r10
-; AVX-NEXT: adcq %r14, %r12
+; AVX-NEXT: addq %r12, %r10
+; AVX-NEXT: adcq %r14, %r15
; AVX-NEXT: setb %al
; AVX-NEXT: movzbl %al, %r14d
; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r9
-; AVX-NEXT: addq %r12, %rax
-; AVX-NEXT: adcq %r14, %rdx
; AVX-NEXT: addq %r15, %rax
-; AVX-NEXT: adcq %rbx, %rdx
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX-NEXT: movq %r10, 8(%r12)
+; AVX-NEXT: adcq %r14, %rdx
+; AVX-NEXT: addq %rbx, %rax
+; AVX-NEXT: adcq %rcx, %rdx
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT: movq %r10, 8(%r15)
; AVX-NEXT: sarq $63, %r10
; AVX-NEXT: xorq %r10, %rdx
; AVX-NEXT: xorq %rax, %r10
-; AVX-NEXT: xorl %r15d, %r15d
+; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: orq %rdx, %r10
-; AVX-NEXT: setne %r15b
-; AVX-NEXT: movq %rcx, %rbx
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: movq %rsi, %r10
-; AVX-NEXT: imulq %rbx, %r10
+; AVX-NEXT: setne %cl
+; AVX-NEXT: movq %r13, %r9
+; AVX-NEXT: sarq $63, %r9
+; AVX-NEXT: movq %rbp, %r11
+; AVX-NEXT: imulq %r9, %r11
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: mulq %rbx
+; AVX-NEXT: mulq %r9
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: imulq %rbp, %rbx
-; AVX-NEXT: addq %r10, %rbx
-; AVX-NEXT: addq %rdx, %rbx
-; AVX-NEXT: movq %rbp, %r10
-; AVX-NEXT: sarq $63, %r10
-; AVX-NEXT: movq %r10, %r14
-; AVX-NEXT: imulq %rcx, %r14
-; AVX-NEXT: movq %r10, %rax
+; AVX-NEXT: addq %rax, %r11
+; AVX-NEXT: addq %rdx, %r11
+; AVX-NEXT: movq %rbp, %rax
+; AVX-NEXT: sarq $63, %rax
+; AVX-NEXT: movq %rax, %r14
+; AVX-NEXT: imulq %r13, %r14
; AVX-NEXT: mulq %r8
-; AVX-NEXT: movq %rax, %r11
-; AVX-NEXT: imulq %r8, %r10
-; AVX-NEXT: addq %r14, %r10
-; AVX-NEXT: addq %rdx, %r10
-; AVX-NEXT: addq %r9, %r11
-; AVX-NEXT: adcq %rbx, %r10
+; AVX-NEXT: movq %rax, %r10
+; AVX-NEXT: addq %rax, %r14
+; AVX-NEXT: addq %rdx, %r14
+; AVX-NEXT: addq %r9, %r10
+; AVX-NEXT: adcq %r11, %r14
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %r9
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: movq %rax, %r11
+; AVX-NEXT: movq %r13, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: movq %rax, %r14
-; AVX-NEXT: addq %r9, %r14
+; AVX-NEXT: movq %rax, %rbx
+; AVX-NEXT: addq %r9, %rbx
; AVX-NEXT: adcq $0, %rsi
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %rbp
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: addq %r14, %r9
+; AVX-NEXT: addq %rbx, %r9
; AVX-NEXT: adcq %rsi, %r8
; AVX-NEXT: setb %al
; AVX-NEXT: movzbl %al, %esi
-; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: movq %r13, %rax
; AVX-NEXT: mulq %rbp
; AVX-NEXT: addq %r8, %rax
; AVX-NEXT: adcq %rsi, %rdx
-; AVX-NEXT: addq %r11, %rax
-; AVX-NEXT: adcq %r10, %rdx
-; AVX-NEXT: movq %r9, 24(%r12)
+; AVX-NEXT: addq %r10, %rax
+; AVX-NEXT: adcq %r14, %rdx
+; AVX-NEXT: movq %r9, 24(%r15)
; AVX-NEXT: sarq $63, %r9
; AVX-NEXT: xorq %r9, %rdx
; AVX-NEXT: xorq %rax, %r9
@@ -3817,11 +3797,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: orq %rdx, %r9
; AVX-NEXT: setne %al
; AVX-NEXT: negl %eax
-; AVX-NEXT: negl %r15d
-; AVX-NEXT: vmovd %r15d, %xmm0
+; AVX-NEXT: negl %ecx
+; AVX-NEXT: vmovd %ecx, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %rbx, 16(%r12)
-; AVX-NEXT: movq %rdi, (%r12)
+; AVX-NEXT: movq %r11, 16(%r15)
+; AVX-NEXT: movq %rdi, (%r15)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r13
@@ -3838,113 +3818,104 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %r9, %rbp
; AVX512F-NEXT: movq %rcx, %r11
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rsi, %r9
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: movq %rcx, %r12
-; AVX512F-NEXT: sarq $63, %r12
-; AVX512F-NEXT: movq %r15, %rbx
-; AVX512F-NEXT: imulq %r12, %rbx
-; AVX512F-NEXT: movq %r15, %rax
-; AVX512F-NEXT: mulq %r12
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: imulq %rsi, %r12
-; AVX512F-NEXT: addq %rbx, %r12
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX512F-NEXT: sarq $63, %rcx
+; AVX512F-NEXT: movq %rbp, %r12
+; AVX512F-NEXT: imulq %rcx, %r12
+; AVX512F-NEXT: movq %r14, %rax
+; AVX512F-NEXT: mulq %rcx
+; AVX512F-NEXT: movq %rax, %r15
+; AVX512F-NEXT: addq %rax, %r12
; AVX512F-NEXT: addq %rdx, %r12
-; AVX512F-NEXT: movq %rsi, %rbx
-; AVX512F-NEXT: sarq $63, %rbx
-; AVX512F-NEXT: movq %rbx, %r13
-; AVX512F-NEXT: imulq %r11, %r13
-; AVX512F-NEXT: movq %rbx, %rax
+; AVX512F-NEXT: movq %rbp, %rax
+; AVX512F-NEXT: sarq $63, %rax
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: imulq %r11, %rcx
; AVX512F-NEXT: mulq %r10
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: imulq %r10, %rbx
-; AVX512F-NEXT: addq %r13, %rbx
-; AVX512F-NEXT: addq %rdx, %rbx
-; AVX512F-NEXT: addq %rcx, %r14
-; AVX512F-NEXT: adcq %r12, %rbx
+; AVX512F-NEXT: movq %rax, %rbx
+; AVX512F-NEXT: addq %rax, %rcx
+; AVX512F-NEXT: addq %rdx, %rcx
+; AVX512F-NEXT: addq %r15, %rbx
+; AVX512F-NEXT: adcq %r12, %rcx
; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %r15
-; AVX512F-NEXT: movq %rdx, %r12
-; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %r15
+; AVX512F-NEXT: mulq %r14
; AVX512F-NEXT: movq %rdx, %r15
-; AVX512F-NEXT: movq %rax, %r13
-; AVX512F-NEXT: addq %r12, %r13
-; AVX512F-NEXT: adcq $0, %r15
+; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512F-NEXT: movq %r11, %rax
+; AVX512F-NEXT: mulq %r14
+; AVX512F-NEXT: movq %rdx, %r14
+; AVX512F-NEXT: movq %rax, %r12
+; AVX512F-NEXT: addq %r15, %r12
+; AVX512F-NEXT: adcq $0, %r14
; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %rsi
-; AVX512F-NEXT: movq %rdx, %r12
+; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: movq %rdx, %r15
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %r13, %r10
-; AVX512F-NEXT: adcq %r15, %r12
+; AVX512F-NEXT: addq %r12, %r10
+; AVX512F-NEXT: adcq %r14, %r15
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %r15d
+; AVX512F-NEXT: movzbl %al, %r14d
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %rsi
-; AVX512F-NEXT: addq %r12, %rax
-; AVX512F-NEXT: adcq %r15, %rdx
-; AVX512F-NEXT: addq %r14, %rax
-; AVX512F-NEXT: adcq %rbx, %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT: movq %r10, 24(%r12)
+; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: addq %r15, %rax
+; AVX512F-NEXT: adcq %r14, %rdx
+; AVX512F-NEXT: addq %rbx, %rax
+; AVX512F-NEXT: adcq %rcx, %rdx
+; AVX512F-NEXT: movq %r10, 24(%r13)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
; AVX512F-NEXT: orq %rdx, %r10
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: sarq $63, %rsi
-; AVX512F-NEXT: movq %r8, %r11
-; AVX512F-NEXT: imulq %rsi, %r11
+; AVX512F-NEXT: movq %rsi, %rcx
+; AVX512F-NEXT: sarq $63, %rcx
+; AVX512F-NEXT: movq %r9, %rbx
+; AVX512F-NEXT: imulq %rcx, %rbx
; AVX512F-NEXT: movq %r8, %rax
-; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: mulq %rcx
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: imulq %rbp, %rsi
-; AVX512F-NEXT: addq %r11, %rsi
-; AVX512F-NEXT: addq %rdx, %rsi
-; AVX512F-NEXT: movq %rbp, %r11
-; AVX512F-NEXT: sarq $63, %r11
-; AVX512F-NEXT: movq %r11, %r14
-; AVX512F-NEXT: imulq %r9, %r14
-; AVX512F-NEXT: movq %r11, %rax
+; AVX512F-NEXT: addq %rax, %rbx
+; AVX512F-NEXT: addq %rdx, %rbx
+; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: sarq $63, %rax
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: imulq %rsi, %rcx
; AVX512F-NEXT: mulq %rdi
-; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: imulq %rdi, %r11
-; AVX512F-NEXT: addq %r14, %r11
-; AVX512F-NEXT: addq %rdx, %r11
-; AVX512F-NEXT: addq %r10, %rbx
-; AVX512F-NEXT: adcq %rsi, %r11
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: addq %rax, %rcx
+; AVX512F-NEXT: addq %rdx, %rcx
+; AVX512F-NEXT: addq %r10, %r11
+; AVX512F-NEXT: adcq %rbx, %rcx
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: movq %rax, %rbx
+; AVX512F-NEXT: movq %rsi, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rax, %r15
-; AVX512F-NEXT: addq %r10, %r15
+; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: addq %r10, %r14
; AVX512F-NEXT: adcq $0, %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: mulq %r9
; AVX512F-NEXT: movq %rdx, %rdi
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %r15, %r10
+; AVX512F-NEXT: addq %r14, %r10
; AVX512F-NEXT: adcq %r8, %rdi
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %esi
-; AVX512F-NEXT: movq %r9, %rax
-; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: movzbl %al, %r8d
+; AVX512F-NEXT: movq %rsi, %rax
+; AVX512F-NEXT: mulq %r9
; AVX512F-NEXT: addq %rdi, %rax
-; AVX512F-NEXT: adcq %rsi, %rdx
-; AVX512F-NEXT: addq %rbx, %rax
-; AVX512F-NEXT: adcq %r11, %rdx
-; AVX512F-NEXT: movq %r10, 8(%r12)
+; AVX512F-NEXT: adcq %r8, %rdx
+; AVX512F-NEXT: addq %r11, %rax
+; AVX512F-NEXT: adcq %rcx, %rdx
+; AVX512F-NEXT: movq %r10, 8(%r13)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
@@ -3956,8 +3927,9 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT: movq %rcx, 16(%r12)
-; AVX512F-NEXT: movq %r14, (%r12)
+; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512F-NEXT: movq %rax, 16(%r13)
+; AVX512F-NEXT: movq %rbx, (%r13)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r13
@@ -3974,113 +3946,104 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: pushq %r13
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: movq %r9, %rbp
; AVX512BW-NEXT: movq %rcx, %r11
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rsi, %r9
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512BW-NEXT: movq %rcx, %r12
-; AVX512BW-NEXT: sarq $63, %r12
-; AVX512BW-NEXT: movq %r15, %rbx
-; AVX512BW-NEXT: imulq %r12, %rbx
-; AVX512BW-NEXT: movq %r15, %rax
-; AVX512BW-NEXT: mulq %r12
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: imulq %rsi, %r12
-; AVX512BW-NEXT: addq %rbx, %r12
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX512BW-NEXT: sarq $63, %rcx
+; AVX512BW-NEXT: movq %rbp, %r12
+; AVX512BW-NEXT: imulq %rcx, %r12
+; AVX512BW-NEXT: movq %r14, %rax
+; AVX512BW-NEXT: mulq %rcx
+; AVX512BW-NEXT: movq %rax, %r15
+; AVX512BW-NEXT: addq %rax, %r12
; AVX512BW-NEXT: addq %rdx, %r12
-; AVX512BW-NEXT: movq %rsi, %rbx
-; AVX512BW-NEXT: sarq $63, %rbx
-; AVX512BW-NEXT: movq %rbx, %r13
-; AVX512BW-NEXT: imulq %r11, %r13
-; AVX512BW-NEXT: movq %rbx, %rax
+; AVX512BW-NEXT: movq %rbp, %rax
+; AVX512BW-NEXT: sarq $63, %rax
+; AVX512BW-NEXT: movq %rax, %rcx
+; AVX512BW-NEXT: imulq %r11, %rcx
; AVX512BW-NEXT: mulq %r10
-; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: imulq %r10, %rbx
-; AVX512BW-NEXT: addq %r13, %rbx
-; AVX512BW-NEXT: addq %rdx, %rbx
-; AVX512BW-NEXT: addq %rcx, %r14
-; AVX512BW-NEXT: adcq %r12, %rbx
+; AVX512BW-NEXT: movq %rax, %rbx
+; AVX512BW-NEXT: addq %rax, %rcx
+; AVX512BW-NEXT: addq %rdx, %rcx
+; AVX512BW-NEXT: addq %r15, %rbx
+; AVX512BW-NEXT: adcq %r12, %rcx
; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %r15
-; AVX512BW-NEXT: movq %rdx, %r12
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %r15
+; AVX512BW-NEXT: mulq %r14
; AVX512BW-NEXT: movq %rdx, %r15
-; AVX512BW-NEXT: movq %rax, %r13
-; AVX512BW-NEXT: addq %r12, %r13
-; AVX512BW-NEXT: adcq $0, %r15
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: movq %r11, %rax
+; AVX512BW-NEXT: mulq %r14
+; AVX512BW-NEXT: movq %rdx, %r14
+; AVX512BW-NEXT: movq %rax, %r12
+; AVX512BW-NEXT: addq %r15, %r12
+; AVX512BW-NEXT: adcq $0, %r14
; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %rsi
-; AVX512BW-NEXT: movq %rdx, %r12
+; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: movq %rdx, %r15
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %r13, %r10
-; AVX512BW-NEXT: adcq %r15, %r12
+; AVX512BW-NEXT: addq %r12, %r10
+; AVX512BW-NEXT: adcq %r14, %r15
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %r15d
+; AVX512BW-NEXT: movzbl %al, %r14d
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %rsi
-; AVX512BW-NEXT: addq %r12, %rax
-; AVX512BW-NEXT: adcq %r15, %rdx
-; AVX512BW-NEXT: addq %r14, %rax
-; AVX512BW-NEXT: adcq %rbx, %rdx
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512BW-NEXT: movq %r10, 24(%r12)
+; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: addq %r15, %rax
+; AVX512BW-NEXT: adcq %r14, %rdx
+; AVX512BW-NEXT: addq %rbx, %rax
+; AVX512BW-NEXT: adcq %rcx, %rdx
+; AVX512BW-NEXT: movq %r10, 24(%r13)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
; AVX512BW-NEXT: orq %rdx, %r10
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: movq %r9, %rsi
-; AVX512BW-NEXT: sarq $63, %rsi
-; AVX512BW-NEXT: movq %r8, %r11
-; AVX512BW-NEXT: imulq %rsi, %r11
+; AVX512BW-NEXT: movq %rsi, %rcx
+; AVX512BW-NEXT: sarq $63, %rcx
+; AVX512BW-NEXT: movq %r9, %rbx
+; AVX512BW-NEXT: imulq %rcx, %rbx
; AVX512BW-NEXT: movq %r8, %rax
-; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: mulq %rcx
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: imulq %rbp, %rsi
-; AVX512BW-NEXT: addq %r11, %rsi
-; AVX512BW-NEXT: addq %rdx, %rsi
-; AVX512BW-NEXT: movq %rbp, %r11
-; AVX512BW-NEXT: sarq $63, %r11
-; AVX512BW-NEXT: movq %r11, %r14
-; AVX512BW-NEXT: imulq %r9, %r14
-; AVX512BW-NEXT: movq %r11, %rax
+; AVX512BW-NEXT: addq %rax, %rbx
+; AVX512BW-NEXT: addq %rdx, %rbx
+; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: sarq $63, %rax
+; AVX512BW-NEXT: movq %rax, %rcx
+; AVX512BW-NEXT: imulq %rsi, %rcx
; AVX512BW-NEXT: mulq %rdi
-; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: imulq %rdi, %r11
-; AVX512BW-NEXT: addq %r14, %r11
-; AVX512BW-NEXT: addq %rdx, %r11
-; AVX512BW-NEXT: addq %r10, %rbx
-; AVX512BW-NEXT: adcq %rsi, %r11
+; AVX512BW-NEXT: movq %rax, %r11
+; AVX512BW-NEXT: addq %rax, %rcx
+; AVX512BW-NEXT: addq %rdx, %rcx
+; AVX512BW-NEXT: addq %r10, %r11
+; AVX512BW-NEXT: adcq %rbx, %rcx
; AVX512BW-NEXT: movq %rdi, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: movq %rax, %rbx
+; AVX512BW-NEXT: movq %rsi, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r8
-; AVX512BW-NEXT: movq %rax, %r15
-; AVX512BW-NEXT: addq %r10, %r15
+; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: addq %r10, %r14
; AVX512BW-NEXT: adcq $0, %r8
; AVX512BW-NEXT: movq %rdi, %rax
-; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: mulq %r9
; AVX512BW-NEXT: movq %rdx, %rdi
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %r15, %r10
+; AVX512BW-NEXT: addq %r14, %r10
; AVX512BW-NEXT: adcq %r8, %rdi
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %esi
-; AVX512BW-NEXT: movq %r9, %rax
-; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: movzbl %al, %r8d
+; AVX512BW-NEXT: movq %rsi, %rax
+; AVX512BW-NEXT: mulq %r9
; AVX512BW-NEXT: addq %rdi, %rax
-; AVX512BW-NEXT: adcq %rsi, %rdx
-; AVX512BW-NEXT: addq %rbx, %rax
-; AVX512BW-NEXT: adcq %r11, %rdx
-; AVX512BW-NEXT: movq %r10, 8(%r12)
+; AVX512BW-NEXT: adcq %r8, %rdx
+; AVX512BW-NEXT: addq %r11, %rax
+; AVX512BW-NEXT: adcq %rcx, %rdx
+; AVX512BW-NEXT: movq %r10, 8(%r13)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
@@ -4092,8 +4055,9 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: movq %rcx, 16(%r12)
-; AVX512BW-NEXT: movq %r14, (%r12)
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: movq %rax, 16(%r13)
+; AVX512BW-NEXT: movq %rbx, (%r13)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
; AVX512BW-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 8d68303300ec6..cbbe089c80192 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -212,68 +212,66 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $8, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %ecx, %ebx
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: movl %ebp, %esi
-; WIN32-NEXT: imull %ebx, %esi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: subl $12, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: imull %eax, %ebx
-; WIN32-NEXT: addl %esi, %ebx
-; WIN32-NEXT: addl %edx, %ebx
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %edx
-; WIN32-NEXT: imull %ecx, %edx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: addl %edx, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: imull %ecx, %edi
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: addl %edi, %eax
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ebx, %esi
-; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: addl %eax, %edi
+; WIN32-NEXT: addl %edx, %edi
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: imull %ebp, %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: addl %eax, %ecx
+; WIN32-NEXT: addl %edx, %ecx
+; WIN32-NEXT: addl %esi, %eax
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: adcl %edi, %ecx
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %ebx, %ecx
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: addl %ebx, %esi
; WIN32-NEXT: adcl $0, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %ecx, %ebp
-; WIN32-NEXT: adcl %edi, %ebx
-; WIN32-NEXT: setb %cl
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl %esi, %ebx
+; WIN32-NEXT: adcl %edi, %ebp
+; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl %cl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %ecx, %edx
+; WIN32-NEXT: movl %ebx, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebp, 4(%eax)
+; WIN32-NEXT: movl %ebx, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
-; WIN32-NEXT: addl $8, %esp
+; WIN32-NEXT: addl $12, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
@@ -572,65 +570,65 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ebp, %edi
-; WIN32-NEXT: imull %ecx, %edi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: imull %ebx, %ecx
-; WIN32-NEXT: addl %edi, %ecx
+; WIN32-NEXT: movl %ecx, %edx
+; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: sarl $31, %edx
+; WIN32-NEXT: movl %esi, %ecx
+; WIN32-NEXT: imull %edx, %ecx
+; WIN32-NEXT: mull %edx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %eax, %ecx
; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: movl %ebx, %edx
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: imull %edi, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: imull %edi, %ebx
-; WIN32-NEXT: addl %edx, %ebx
; WIN32-NEXT: mull %edi
-; WIN32-NEXT: addl %edx, %ebx
-; WIN32-NEXT: addl %esi, %eax
+; WIN32-NEXT: addl %eax, %esi
+; WIN32-NEXT: addl %edx, %esi
+; WIN32-NEXT: addl %ebp, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %ebx
+; WIN32-NEXT: adcl %ecx, %esi
; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: addl %esi, %edi
-; WIN32-NEXT: adcl $0, %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: addl %ebp, %ecx
+; WIN32-NEXT: adcl $0, %ebx
+; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: adcl %ecx, %ebp
+; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %ecx, %ebp
+; WIN32-NEXT: adcl %ebx, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: setb %cl
-; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: addl %edi, %eax
; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %ebx, %edx
-; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: xorl %esi, %edx
-; WIN32-NEXT: xorl %eax, %esi
+; WIN32-NEXT: adcl %esi, %edx
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: xorl %ebp, %edx
+; WIN32-NEXT: xorl %eax, %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: orl %edx, %esi
+; WIN32-NEXT: orl %edx, %ebp
; WIN32-NEXT: jne LBB12_2
; WIN32-NEXT: # %bb.1:
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: LBB12_2:
-; WIN32-NEXT: movl %edi, %edx
+; WIN32-NEXT: movl %ebx, %edx
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
@@ -991,57 +989,54 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: movl %ebp, %esi
-; WIN32-NEXT: imull %edi, %esi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: subl $8, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: imull %eax, %edi
-; WIN32-NEXT: addl %esi, %edi
-; WIN32-NEXT: addl %edx, %edi
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %edx
-; WIN32-NEXT: imull %ecx, %edx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: movl %ebx, %esi
; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: addl %edx, %esi
; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %eax, %esi
; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %edi, %esi
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: movl %ecx, %ebx
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: imull %edi, %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: mull %ebx
+; WIN32-NEXT: addl %eax, %ecx
+; WIN32-NEXT: addl %edx, %ecx
+; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: adcl %esi, %ecx
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: movl %edx, %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %edi, %ecx
-; WIN32-NEXT: adcl $0, %ebp
+; WIN32-NEXT: mull %esi
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %edi, %ebp
+; WIN32-NEXT: adcl $0, %esi
; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: addl %ecx, %edi
-; WIN32-NEXT: adcl %ebp, %ebx
-; WIN32-NEXT: setb %cl
+; WIN32-NEXT: addl %ebp, %edi
+; WIN32-NEXT: adcl %esi, %ebx
+; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl %cl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; WIN32-NEXT: adcl %esi, %edx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %ecx, %edx
; WIN32-NEXT: sarl $31, %edi
; WIN32-NEXT: xorl %edi, %edx
; WIN32-NEXT: xorl %eax, %edi
@@ -1050,7 +1045,7 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: # %bb.3: # %continue
; WIN32-NEXT: movb $1, %al
; WIN32-NEXT: LBB18_2: # %overflow
-; WIN32-NEXT: addl $4, %esp
+; WIN32-NEXT: addl $8, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
@@ -1699,69 +1694,68 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $16, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: subl $20, %esp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %esi
-; WIN32-NEXT: movl 4(%eax), %ebp
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: movl %ebx, %ecx
-; WIN32-NEXT: imull %ebp, %ecx
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: imull %esi, %ebx
-; WIN32-NEXT: addl %ecx, %ebx
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: addl %edx, %ebx
-; WIN32-NEXT: movl %ebp, %ecx
-; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: movl (%eax), %edx
+; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; WIN32-NEXT: movl 4(%eax), %ebx
+; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: imull %ebx, %esi
+; WIN32-NEXT: mull %edx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %eax, %esi
+; WIN32-NEXT: addl %edx, %esi
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: imull %edi, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: imull %ecx, %edi
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: addl %edi, %ecx
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: addl %eax, %ecx
; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %ebx, %ecx
+; WIN32-NEXT: addl %eax, %ebp
+; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: adcl %esi, %ecx
+; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull %edi
; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: addl %ebx, %edi
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; WIN32-NEXT: adcl $0, %ebp
; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: adcl %ebp, %ebx
-; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl %edi, %ebx
+; WIN32-NEXT: adcl %ebp, %esi
+; WIN32-NEXT: setb (%esp) # 1-byte Folded Spill
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; WIN32-NEXT: adcl %edi, %edx
+; WIN32-NEXT: addl %esi, %eax
+; WIN32-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload
+; WIN32-NEXT: adcl %esi, %edx
; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: movl %esi, %ecx
+; WIN32-NEXT: movl %ebx, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %esi, 4(%eax)
+; WIN32-NEXT: movl %ebx, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
-; WIN32-NEXT: addl $16, %esp
+; WIN32-NEXT: addl $20, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
@@ -1810,62 +1804,58 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl (%eax), %ebx
; WIN32-NEXT: movl 4(%eax), %ebp
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: movl %ebx, %esi
-; WIN32-NEXT: imull %edi, %esi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: imull %ebp, %edi
-; WIN32-NEXT: addl %esi, %edi
-; WIN32-NEXT: addl %edx, %edi
-; WIN32-NEXT: movl %ebp, %esi
; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %edx
-; WIN32-NEXT: imull %ecx, %edx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: addl %edx, %esi
+; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: movl %ebp, %edi
+; WIN32-NEXT: imull %ecx, %edi
+; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: addl %eax, %edi
+; WIN32-NEXT: addl %edx, %edi
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: sarl $31, %eax
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: addl %eax, %ecx
+; WIN32-NEXT: addl %edx, %ecx
+; WIN32-NEXT: addl %esi, %eax
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %edi, %esi
-; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: adcl %edi, %ecx
+; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull %ebx
; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: addl %ebp, %esi
; WIN32-NEXT: adcl $0, %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %ecx, %ebp
-; WIN32-NEXT: adcl %edi, %ebx
-; WIN32-NEXT: setb %cl
+; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl %esi, %ebx
+; WIN32-NEXT: adcl %edi, %ebp
+; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl %cl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %ecx, %edx
+; WIN32-NEXT: movl %ebx, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebp, 4(%eax)
-; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; WIN32-NEXT: movl %ebx, 4(%eax)
+; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
; WIN32-NEXT: addl $16, %esp