[llvm] db25f51 - Revert "[DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))"
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 22 22:51:48 PDT 2022
Author: Craig Topper
Date: 2022-10-22T22:50:43-07:00
New Revision: db25f51e37baf3b804ec541a834d3fd0b6b44118
URL: https://github.com/llvm/llvm-project/commit/db25f51e37baf3b804ec541a834d3fd0b6b44118
DIFF: https://github.com/llvm/llvm-project/commit/db25f51e37baf3b804ec541a834d3fd0b6b44118.diff
LOG: Revert "[DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))"
This reverts commit e8b3ffa532b8ebac5dcdf17bb91b47817382c14d.
The AMDGPU/mad_64_32.ll test seems to fail on some of the build bots
but passes locally. I'm really confused.
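For context, the reverted combine rests on the observation that (sra X, BW-1)
is either 0 or -1 (all ones), so multiplying Y by it produces the same value
as negating (and (sra X, BW-1), Y). Below is a minimal standalone sketch of
that identity for i32, assuming two's-complement wraparound and an arithmetic
right shift of negative values (mirroring ISD::SRA); the helper names are
illustrative only:

#include <cassert>
#include <cstdint>

// S = X >> 31 is either 0 or -1 (all ones). Arithmetic is done in uint32_t so
// the wraparound case (Y == INT32_MIN with S == -1) stays well defined.
static int32_t mulForm(int32_t X, int32_t Y) {
  int32_t S = X >> 31;                                // sra X, BW-1
  return (int32_t)((uint32_t)S * (uint32_t)Y);        // mul S, Y
}

static int32_t andNegForm(int32_t X, int32_t Y) {
  int32_t S = X >> 31;                                // sra X, BW-1
  return (int32_t)(0u - ((uint32_t)S & (uint32_t)Y)); // neg (and S, Y)
}

int main() {
  const int32_t Vals[] = {0, 1, -1, 7, -42, INT32_MAX, INT32_MIN};
  for (int32_t X : Vals)
    for (int32_t Y : Vals)
      assert(mulForm(X, Y) == andNegForm(X, Y));
  return 0;
}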
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
llvm/test/CodeGen/AMDGPU/mad_64_32.ll
llvm/test/CodeGen/PowerPC/pr45448.ll
llvm/test/CodeGen/RISCV/mul.ll
llvm/test/CodeGen/RISCV/xaluo.ll
llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
llvm/test/CodeGen/X86/extmul128.ll
llvm/test/CodeGen/X86/muloti.ll
llvm/test/CodeGen/X86/smul_fix_sat.ll
llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/xmulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 44ce4947a6ab8..a4c04b525bdf0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3957,30 +3957,6 @@ SDValue DAGCombiner::visitMULFIX(SDNode *N) {
return SDValue();
}
-// Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
-static SDValue foldSraMulToAndNeg(SDNode *N, SDValue N0, SDValue N1,
- SelectionDAG &DAG) {
- if (N0.getOpcode() != ISD::SRA)
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // TODO: Use computeNumSignBits() == BitWidth?
- unsigned BitWidth = VT.getScalarSizeInBits();
- ConstantSDNode *ShiftAmt = isConstOrConstSplat(N0.getOperand(1));
- if (!ShiftAmt || ShiftAmt->getAPIntValue() != (BitWidth - 1))
- return SDValue();
-
- // If optimizing for minsize, we don't want to increase the number of
- // instructions.
- if (DAG.getMachineFunction().getFunction().hasMinSize())
- return SDValue();
-
- SDLoc dl(N);
- SDValue And = DAG.getNode(ISD::AND, dl, VT, N0, N1);
- return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), And);
-}
-
SDValue DAGCombiner::visitMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4191,11 +4167,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
}
}
- if (SDValue V = foldSraMulToAndNeg(N, N0, N1, DAG))
- return V;
- if (SDValue V = foldSraMulToAndNeg(N, N1, N0, DAG))
- return V;
-
// reassociate mul
if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
return RMUL;
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index c01ec69629f30..e955014371525 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -39,24 +39,21 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
; AARCH: // %bb.0: // %Entry
; AARCH-NEXT: asr x9, x1, #63
; AARCH-NEXT: asr x10, x3, #63
-; AARCH-NEXT: and x11, x9, x2
-; AARCH-NEXT: and x14, x10, x1
-; AARCH-NEXT: umulh x12, x2, x9
-; AARCH-NEXT: and x9, x9, x3
-; AARCH-NEXT: umulh x13, x10, x0
-; AARCH-NEXT: and x10, x10, x0
-; AARCH-NEXT: sub x12, x12, x11
-; AARCH-NEXT: neg x11, x11
-; AARCH-NEXT: sub x13, x13, x14
-; AARCH-NEXT: sub x9, x12, x9
-; AARCH-NEXT: sub x12, x13, x10
-; AARCH-NEXT: neg x10, x10
; AARCH-NEXT: umulh x14, x0, x2
+; AARCH-NEXT: mov x8, x1
+; AARCH-NEXT: mul x11, x2, x9
+; AARCH-NEXT: str wzr, [x4]
+; AARCH-NEXT: umulh x12, x10, x0
+; AARCH-NEXT: umulh x13, x2, x9
+; AARCH-NEXT: madd x12, x10, x1, x12
+; AARCH-NEXT: add x13, x13, x11
+; AARCH-NEXT: mul x10, x10, x0
+; AARCH-NEXT: madd x9, x3, x9, x13
+; AARCH-NEXT: add x12, x12, x10
; AARCH-NEXT: adds x10, x10, x11
; AARCH-NEXT: mul x11, x1, x2
; AARCH-NEXT: adc x9, x12, x9
; AARCH-NEXT: umulh x13, x1, x2
-; AARCH-NEXT: mov x8, x1
; AARCH-NEXT: mul x12, x0, x3
; AARCH-NEXT: adds x11, x11, x14
; AARCH-NEXT: umulh x14, x0, x3
@@ -76,7 +73,6 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
; AARCH-NEXT: eor x9, x9, x11
; AARCH-NEXT: eor x10, x10, x11
; AARCH-NEXT: orr x9, x10, x9
-; AARCH-NEXT: str wzr, [x4]
; AARCH-NEXT: cmp x9, #0
; AARCH-NEXT: cset w9, ne
; AARCH-NEXT: tbz x8, #63, .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index fc65050be9f92..f806149d0c395 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -159,28 +159,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; CI-NEXT: v_ashrrev_i32_e32 v11, 31, v0
+; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v0
; CI-NEXT: v_mov_b32_e32 v8, 0
-; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v1, v[7:8]
-; CI-NEXT: v_ashrrev_i32_e32 v12, 31, v1
-; CI-NEXT: v_and_b32_e32 v14, v11, v1
-; CI-NEXT: v_mov_b32_e32 v1, v10
+; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
+; CI-NEXT: v_ashrrev_i32_e32 v14, 31, v1
+; CI-NEXT: v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
+; CI-NEXT: v_mov_b32_e32 v7, v10
; CI-NEXT: v_mov_b32_e32 v10, v8
-; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10]
-; CI-NEXT: v_and_b32_e32 v13, v11, v12
-; CI-NEXT: v_sub_i32_e32 v9, vcc, 0, v14
-; CI-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc
-; CI-NEXT: v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10]
-; CI-NEXT: v_mov_b32_e32 v0, v8
-; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; CI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
-; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1]
-; CI-NEXT: v_add_i32_e32 v8, vcc, v0, v9
-; CI-NEXT: v_addc_u32_e32 v9, vcc, v1, v10, vcc
-; CI-NEXT: v_mov_b32_e32 v1, v7
+; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
+; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
+; CI-NEXT: v_add_i32_e32 v9, vcc, v7, v9
+; CI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
+; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
+; CI-NEXT: v_add_i32_e32 v7, vcc, v9, v0
+; CI-NEXT: v_addc_u32_e32 v9, vcc, v10, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v1, v8
; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CI-NEXT: v_addc_u32_e32 v2, vcc, v8, v4, vcc
+; CI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -188,64 +184,60 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; SI-NEXT: v_mul_lo_u32 v11, v6, v1
+; SI-NEXT: v_mul_hi_u32 v12, v0, v1
; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; SI-NEXT: v_and_b32_e32 v9, v6, v1
-; SI-NEXT: v_and_b32_e32 v10, v7, v0
-; SI-NEXT: v_mul_lo_u32 v13, v6, v1
-; SI-NEXT: v_mul_hi_u32 v14, v0, v1
-; SI-NEXT: v_and_b32_e32 v8, v6, v7
-; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; SI-NEXT: v_mul_hi_u32 v10, v6, v7
-; SI-NEXT: v_mul_i32_i24_e32 v11, v6, v7
-; SI-NEXT: v_mul_hi_u32 v6, v6, v1
-; SI-NEXT: v_mul_hi_u32 v12, v0, v7
-; SI-NEXT: v_mul_lo_u32 v7, v0, v7
-; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v8, vcc
-; SI-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; SI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc
-; SI-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; SI-NEXT: v_mul_hi_u32 v14, v6, v1
+; SI-NEXT: v_mul_lo_u32 v13, v0, v7
+; SI-NEXT: v_mul_hi_u32 v10, v0, v7
+; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12
+; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
+; SI-NEXT: v_mul_hi_u32 v8, v6, v7
+; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7
+; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10
+; SI-NEXT: v_mul_hi_i32 v6, v1, v6
+; SI-NEXT: v_mul_hi_i32 v7, v7, v0
+; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
+; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc
+; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v11
; SI-NEXT: v_mul_lo_u32 v0, v0, v1
-; SI-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc
-; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
-; SI-NEXT: v_subb_u32_e32 v8, vcc, v10, v8, vcc
+; SI-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
+; SI-NEXT: v_add_i32_e32 v7, vcc, v9, v10
+; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT: v_addc_u32_e32 v1, vcc, v7, v3, vcc
-; SI-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc
-; SI-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v12, v3, vcc
+; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v0
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v1
-; GFX9-NEXT: v_and_b32_e32 v6, v14, v1
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
-; GFX9-NEXT: v_mov_b32_e32 v10, v9
-; GFX9-NEXT: v_and_b32_e32 v7, v14, v15
-; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v6
-; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11]
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mov_b32_e32 v10, v13
-; GFX9-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13]
-; GFX9-NEXT: v_mov_b32_e32 v12, v1
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11]
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v2
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-NEXT: v_mov_b32_e32 v11, v9
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
+; GFX9-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
+; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
+; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, v10
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
@@ -254,30 +246,27 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0
; GFX11-NEXT: v_mov_b32_e32 v8, 0
-; GFX11-NEXT: v_ashrrev_i32_e32 v16, 31, v0
-; GFX11-NEXT: v_ashrrev_i32_e32 v17, 31, v1
+; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0
+; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v16, v1, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
-; GFX11-NEXT: v_and_b32_e32 v8, v16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v17, v[9:10]
-; GFX11-NEXT: v_and_b32_e32 v9, v16, v17
-; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, 0, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
-; GFX11-NEXT: v_mov_b32_e32 v1, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
+; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
+; GFX11-NEXT: v_mov_b32_e32 v8, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9]
-; GFX11-NEXT: v_add_co_u32 v12, s0, v7, v1
-; GFX11-NEXT: v_mov_b32_e32 v7, v11
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, 0, s0
+; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
+; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13]
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v14
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
+; GFX11-NEXT: v_mov_b32_e32 v7, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll
index c3337c78a4770..0f8014df8adca 100644
--- a/llvm/test/CodeGen/PowerPC/pr45448.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45448.ll
@@ -25,8 +25,7 @@ define hidden void @julia_tryparse_internal_45896() #0 {
; CHECK-NEXT: rldic r5, r5, 4, 32
; CHECK-NEXT: crnot 4*cr5+lt, eq
; CHECK-NEXT: mulhdu r3, r3, r5
-; CHECK-NEXT: and r6, r4, r5
-; CHECK-NEXT: sub r6, r3, r6
+; CHECK-NEXT: maddld r6, r4, r5, r3
; CHECK-NEXT: cmpld cr1, r6, r3
; CHECK-NEXT: mulhdu. r3, r4, r5
; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_10
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 986e799428e57..3923c4340d30e 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1480,18 +1480,18 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
; RV32IM-NEXT: add a5, a6, a2
; RV32IM-NEXT: mul a7, a1, a3
; RV32IM-NEXT: add t0, a7, a5
-; RV32IM-NEXT: and t1, a4, a0
-; RV32IM-NEXT: sub a2, t0, t1
+; RV32IM-NEXT: mul t1, a4, a0
+; RV32IM-NEXT: add a2, t0, t1
; RV32IM-NEXT: sltu t2, a2, t0
; RV32IM-NEXT: sltu a7, t0, a7
; RV32IM-NEXT: sltu a5, a5, a6
; RV32IM-NEXT: mulhu a3, a1, a3
; RV32IM-NEXT: add a3, a3, a5
; RV32IM-NEXT: add a3, a3, a7
-; RV32IM-NEXT: and a1, a4, a1
+; RV32IM-NEXT: mul a1, a4, a1
; RV32IM-NEXT: mulhu a0, a4, a0
-; RV32IM-NEXT: sub a0, a0, a1
-; RV32IM-NEXT: sub a0, a0, t1
+; RV32IM-NEXT: add a0, a0, a1
+; RV32IM-NEXT: add a0, a0, t1
; RV32IM-NEXT: add a0, a3, a0
; RV32IM-NEXT: add a1, a0, t2
; RV32IM-NEXT: mv a0, a2
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index f3391b2816495..f6963fd674d3e 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -961,10 +961,8 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
; RV32-NEXT: .cfi_offset s1, -8
-; RV32-NEXT: .cfi_offset s2, -12
; RV32-NEXT: mulhu a5, a0, a2
; RV32-NEXT: mul a6, a1, a2
; RV32-NEXT: add a5, a6, a5
@@ -980,34 +978,33 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32-NEXT: mul t0, a1, a3
; RV32-NEXT: add t1, t0, a7
; RV32-NEXT: srai t2, a1, 31
-; RV32-NEXT: and t3, t2, a2
+; RV32-NEXT: mul t3, a2, t2
; RV32-NEXT: srai t4, a3, 31
-; RV32-NEXT: and t5, t4, a0
-; RV32-NEXT: neg t6, t5
-; RV32-NEXT: sub s0, t6, t3
-; RV32-NEXT: add s1, t1, s0
-; RV32-NEXT: sltu s2, s1, t1
+; RV32-NEXT: mul t5, t4, a0
+; RV32-NEXT: add t6, t5, t3
+; RV32-NEXT: add s0, t1, t6
+; RV32-NEXT: sltu s1, s0, t1
; RV32-NEXT: sltu t0, t1, t0
; RV32-NEXT: sltu a6, a7, a6
; RV32-NEXT: mulhu a7, a1, a3
; RV32-NEXT: add a6, a7, a6
; RV32-NEXT: add a6, a6, t0
; RV32-NEXT: mulhu a7, a2, t2
-; RV32-NEXT: sub a7, a7, t3
-; RV32-NEXT: and a3, t2, a3
-; RV32-NEXT: sub a3, a7, a3
-; RV32-NEXT: and a1, t4, a1
+; RV32-NEXT: add a7, a7, t3
+; RV32-NEXT: mul a3, a3, t2
+; RV32-NEXT: add a3, a7, a3
+; RV32-NEXT: mul a1, t4, a1
; RV32-NEXT: mulhu a7, t4, a0
-; RV32-NEXT: sub a1, a7, a1
-; RV32-NEXT: sub a1, a1, t5
+; RV32-NEXT: add a1, a7, a1
+; RV32-NEXT: add a1, a1, t5
; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: sltu a3, s0, t6
+; RV32-NEXT: sltu a3, t6, t5
; RV32-NEXT: add a1, a1, a3
; RV32-NEXT: add a1, a6, a1
-; RV32-NEXT: add a1, a1, s2
+; RV32-NEXT: add a1, a1, s1
; RV32-NEXT: srai a3, a5, 31
; RV32-NEXT: xor a1, a1, a3
-; RV32-NEXT: xor a3, s1, a3
+; RV32-NEXT: xor a3, s0, a3
; RV32-NEXT: or a1, a3, a1
; RV32-NEXT: snez a1, a1
; RV32-NEXT: mul a0, a0, a2
@@ -1016,7 +1013,6 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32-NEXT: mv a0, a1
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -1036,10 +1032,8 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
; RV32ZBA-NEXT: .cfi_offset s1, -8
-; RV32ZBA-NEXT: .cfi_offset s2, -12
; RV32ZBA-NEXT: mulhu a5, a0, a2
; RV32ZBA-NEXT: mul a6, a1, a2
; RV32ZBA-NEXT: add a5, a6, a5
@@ -1055,34 +1049,33 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32ZBA-NEXT: mul t0, a1, a3
; RV32ZBA-NEXT: add t1, t0, a7
; RV32ZBA-NEXT: srai t2, a1, 31
-; RV32ZBA-NEXT: and t3, t2, a2
+; RV32ZBA-NEXT: mul t3, a2, t2
; RV32ZBA-NEXT: srai t4, a3, 31
-; RV32ZBA-NEXT: and t5, t4, a0
-; RV32ZBA-NEXT: neg t6, t5
-; RV32ZBA-NEXT: sub s0, t6, t3
-; RV32ZBA-NEXT: add s1, t1, s0
-; RV32ZBA-NEXT: sltu s2, s1, t1
+; RV32ZBA-NEXT: mul t5, t4, a0
+; RV32ZBA-NEXT: add t6, t5, t3
+; RV32ZBA-NEXT: add s0, t1, t6
+; RV32ZBA-NEXT: sltu s1, s0, t1
; RV32ZBA-NEXT: sltu t0, t1, t0
; RV32ZBA-NEXT: sltu a6, a7, a6
; RV32ZBA-NEXT: mulhu a7, a1, a3
; RV32ZBA-NEXT: add a6, a7, a6
; RV32ZBA-NEXT: add a6, a6, t0
; RV32ZBA-NEXT: mulhu a7, a2, t2
-; RV32ZBA-NEXT: sub a7, a7, t3
-; RV32ZBA-NEXT: and a3, t2, a3
-; RV32ZBA-NEXT: sub a3, a7, a3
-; RV32ZBA-NEXT: and a1, t4, a1
+; RV32ZBA-NEXT: add a7, a7, t3
+; RV32ZBA-NEXT: mul a3, a3, t2
+; RV32ZBA-NEXT: add a3, a7, a3
+; RV32ZBA-NEXT: mul a1, t4, a1
; RV32ZBA-NEXT: mulhu a7, t4, a0
-; RV32ZBA-NEXT: sub a1, a7, a1
-; RV32ZBA-NEXT: sub a1, a1, t5
+; RV32ZBA-NEXT: add a1, a7, a1
+; RV32ZBA-NEXT: add a1, a1, t5
; RV32ZBA-NEXT: add a1, a1, a3
-; RV32ZBA-NEXT: sltu a3, s0, t6
+; RV32ZBA-NEXT: sltu a3, t6, t5
; RV32ZBA-NEXT: add a1, a1, a3
; RV32ZBA-NEXT: add a1, a6, a1
-; RV32ZBA-NEXT: add a1, a1, s2
+; RV32ZBA-NEXT: add a1, a1, s1
; RV32ZBA-NEXT: srai a3, a5, 31
; RV32ZBA-NEXT: xor a1, a1, a3
-; RV32ZBA-NEXT: xor a3, s1, a3
+; RV32ZBA-NEXT: xor a3, s0, a3
; RV32ZBA-NEXT: or a1, a3, a1
; RV32ZBA-NEXT: snez a1, a1
; RV32ZBA-NEXT: mul a0, a0, a2
@@ -1091,7 +1084,6 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -1123,8 +1115,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: srai a1, a1, 31
-; RV32-NEXT: andi a6, a1, 13
-; RV32-NEXT: sub a6, a5, a6
+; RV32-NEXT: mul a6, a1, a3
+; RV32-NEXT: add a6, a5, a6
; RV32-NEXT: srai a7, a4, 31
; RV32-NEXT: xor t0, a6, a7
; RV32-NEXT: sltu a5, a6, a5
@@ -1160,8 +1152,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: srai a1, a1, 31
-; RV32ZBA-NEXT: andi a6, a1, 13
-; RV32ZBA-NEXT: sub a6, a5, a6
+; RV32ZBA-NEXT: mul a6, a1, a3
+; RV32ZBA-NEXT: add a6, a5, a6
; RV32ZBA-NEXT: srai a7, a4, 31
; RV32ZBA-NEXT: xor t0, a6, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
@@ -2360,9 +2352,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
; RV32-NEXT: add a4, a5, a4
@@ -2378,34 +2368,33 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mul a7, a1, a3
; RV32-NEXT: add t0, a7, a6
; RV32-NEXT: srai t1, a1, 31
-; RV32-NEXT: and t2, t1, a2
+; RV32-NEXT: mul t2, a2, t1
; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: and t4, t3, a0
-; RV32-NEXT: neg t5, t4
-; RV32-NEXT: sub t6, t5, t2
-; RV32-NEXT: add s0, t0, t6
-; RV32-NEXT: sltu s1, s0, t0
+; RV32-NEXT: mul t4, t3, a0
+; RV32-NEXT: add t5, t4, t2
+; RV32-NEXT: add t6, t0, t5
+; RV32-NEXT: sltu s0, t6, t0
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a6, a2, t1
-; RV32-NEXT: sub a6, a6, t2
-; RV32-NEXT: and a7, t1, a3
-; RV32-NEXT: sub a6, a6, a7
-; RV32-NEXT: and a7, t3, a1
+; RV32-NEXT: add a6, a6, t2
+; RV32-NEXT: mul a7, a3, t1
+; RV32-NEXT: add a6, a6, a7
+; RV32-NEXT: mul a7, t3, a1
; RV32-NEXT: mulhu t0, t3, a0
-; RV32-NEXT: sub a7, t0, a7
-; RV32-NEXT: sub a7, a7, t4
+; RV32-NEXT: add a7, t0, a7
+; RV32-NEXT: add a7, a7, t4
; RV32-NEXT: add a6, a7, a6
-; RV32-NEXT: sltu a7, t6, t5
+; RV32-NEXT: sltu a7, t5, t4
; RV32-NEXT: add a6, a6, a7
; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: add a5, a5, s1
+; RV32-NEXT: add a5, a5, s0
; RV32-NEXT: srai a4, a4, 31
; RV32-NEXT: xor a5, a5, a4
-; RV32-NEXT: xor a4, s0, a4
+; RV32-NEXT: xor a4, t6, a4
; RV32-NEXT: or a4, a4, a5
; RV32-NEXT: bnez a4, .LBB46_2
; RV32-NEXT: # %bb.1: # %entry
@@ -2413,7 +2402,6 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mv a1, a3
; RV32-NEXT: .LBB46_2: # %entry
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -2433,9 +2421,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: addi sp, sp, -16
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
-; RV32ZBA-NEXT: .cfi_offset s1, -8
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
; RV32ZBA-NEXT: add a4, a5, a4
@@ -2451,34 +2437,33 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mul a7, a1, a3
; RV32ZBA-NEXT: add t0, a7, a6
; RV32ZBA-NEXT: srai t1, a1, 31
-; RV32ZBA-NEXT: and t2, t1, a2
+; RV32ZBA-NEXT: mul t2, a2, t1
; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: and t4, t3, a0
-; RV32ZBA-NEXT: neg t5, t4
-; RV32ZBA-NEXT: sub t6, t5, t2
-; RV32ZBA-NEXT: add s0, t0, t6
-; RV32ZBA-NEXT: sltu s1, s0, t0
+; RV32ZBA-NEXT: mul t4, t3, a0
+; RV32ZBA-NEXT: add t5, t4, t2
+; RV32ZBA-NEXT: add t6, t0, t5
+; RV32ZBA-NEXT: sltu s0, t6, t0
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a6, a2, t1
-; RV32ZBA-NEXT: sub a6, a6, t2
-; RV32ZBA-NEXT: and a7, t1, a3
-; RV32ZBA-NEXT: sub a6, a6, a7
-; RV32ZBA-NEXT: and a7, t3, a1
+; RV32ZBA-NEXT: add a6, a6, t2
+; RV32ZBA-NEXT: mul a7, a3, t1
+; RV32ZBA-NEXT: add a6, a6, a7
+; RV32ZBA-NEXT: mul a7, t3, a1
; RV32ZBA-NEXT: mulhu t0, t3, a0
-; RV32ZBA-NEXT: sub a7, t0, a7
-; RV32ZBA-NEXT: sub a7, a7, t4
+; RV32ZBA-NEXT: add a7, t0, a7
+; RV32ZBA-NEXT: add a7, a7, t4
; RV32ZBA-NEXT: add a6, a7, a6
-; RV32ZBA-NEXT: sltu a7, t6, t5
+; RV32ZBA-NEXT: sltu a7, t5, t4
; RV32ZBA-NEXT: add a6, a6, a7
; RV32ZBA-NEXT: add a5, a5, a6
-; RV32ZBA-NEXT: add a5, a5, s1
+; RV32ZBA-NEXT: add a5, a5, s0
; RV32ZBA-NEXT: srai a4, a4, 31
; RV32ZBA-NEXT: xor a5, a5, a4
-; RV32ZBA-NEXT: xor a4, s0, a4
+; RV32ZBA-NEXT: xor a4, t6, a4
; RV32ZBA-NEXT: or a4, a4, a5
; RV32ZBA-NEXT: bnez a4, .LBB46_2
; RV32ZBA-NEXT: # %bb.1: # %entry
@@ -2486,7 +2471,6 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mv a1, a3
; RV32ZBA-NEXT: .LBB46_2: # %entry
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -2513,9 +2497,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
; RV32-NEXT: add a4, a5, a4
@@ -2531,38 +2513,36 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mul a7, a1, a3
; RV32-NEXT: add t0, a7, a6
; RV32-NEXT: srai t1, a1, 31
-; RV32-NEXT: and t2, t1, a2
+; RV32-NEXT: mul t2, a2, t1
; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: and t4, t3, a0
-; RV32-NEXT: neg t5, t4
-; RV32-NEXT: sub t6, t5, t2
-; RV32-NEXT: add s0, t0, t6
-; RV32-NEXT: sltu s1, s0, t0
+; RV32-NEXT: mul t4, t3, a0
+; RV32-NEXT: add t5, t4, t2
+; RV32-NEXT: add t6, t0, t5
+; RV32-NEXT: sltu s0, t6, t0
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a2, a2, t1
-; RV32-NEXT: sub a2, a2, t2
-; RV32-NEXT: and a3, t1, a3
-; RV32-NEXT: sub a2, a2, a3
-; RV32-NEXT: and a1, t3, a1
+; RV32-NEXT: add a2, a2, t2
+; RV32-NEXT: mul a3, a3, t1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: mul a1, t3, a1
; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: sub a0, a0, t4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, a0, t4
; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: sltu a1, t6, t5
+; RV32-NEXT: sltu a1, t5, t4
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: add a0, a0, s0
; RV32-NEXT: srai a1, a4, 31
; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: xor a1, s0, a1
+; RV32-NEXT: xor a1, t6, a1
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -2580,9 +2560,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: addi sp, sp, -16
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
-; RV32ZBA-NEXT: .cfi_offset s1, -8
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
; RV32ZBA-NEXT: add a4, a5, a4
@@ -2598,38 +2576,36 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mul a7, a1, a3
; RV32ZBA-NEXT: add t0, a7, a6
; RV32ZBA-NEXT: srai t1, a1, 31
-; RV32ZBA-NEXT: and t2, t1, a2
+; RV32ZBA-NEXT: mul t2, a2, t1
; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: and t4, t3, a0
-; RV32ZBA-NEXT: neg t5, t4
-; RV32ZBA-NEXT: sub t6, t5, t2
-; RV32ZBA-NEXT: add s0, t0, t6
-; RV32ZBA-NEXT: sltu s1, s0, t0
+; RV32ZBA-NEXT: mul t4, t3, a0
+; RV32ZBA-NEXT: add t5, t4, t2
+; RV32ZBA-NEXT: add t6, t0, t5
+; RV32ZBA-NEXT: sltu s0, t6, t0
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a2, a2, t1
-; RV32ZBA-NEXT: sub a2, a2, t2
-; RV32ZBA-NEXT: and a3, t1, a3
-; RV32ZBA-NEXT: sub a2, a2, a3
-; RV32ZBA-NEXT: and a1, t3, a1
+; RV32ZBA-NEXT: add a2, a2, t2
+; RV32ZBA-NEXT: mul a3, a3, t1
+; RV32ZBA-NEXT: add a2, a2, a3
+; RV32ZBA-NEXT: mul a1, t3, a1
; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: sub a0, a0, a1
-; RV32ZBA-NEXT: sub a0, a0, t4
+; RV32ZBA-NEXT: add a0, a0, a1
+; RV32ZBA-NEXT: add a0, a0, t4
; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: sltu a1, t6, t5
+; RV32ZBA-NEXT: sltu a1, t5, t4
; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: add a0, a5, a0
-; RV32ZBA-NEXT: add a0, a0, s1
+; RV32ZBA-NEXT: add a0, a0, s0
; RV32ZBA-NEXT: srai a1, a4, 31
; RV32ZBA-NEXT: xor a0, a0, a1
-; RV32ZBA-NEXT: xor a1, s0, a1
+; RV32ZBA-NEXT: xor a1, t6, a1
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: seqz a0, a0
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -3477,9 +3453,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
; RV32-NEXT: add a4, a5, a4
@@ -3495,34 +3469,33 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mul a7, a1, a3
; RV32-NEXT: add t0, a7, a6
; RV32-NEXT: srai t1, a1, 31
-; RV32-NEXT: and t2, t1, a2
+; RV32-NEXT: mul t2, a2, t1
; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: and t4, t3, a0
-; RV32-NEXT: neg t5, t4
-; RV32-NEXT: sub t6, t5, t2
-; RV32-NEXT: add s0, t0, t6
-; RV32-NEXT: sltu s1, s0, t0
+; RV32-NEXT: mul t4, t3, a0
+; RV32-NEXT: add t5, t4, t2
+; RV32-NEXT: add t6, t0, t5
+; RV32-NEXT: sltu s0, t6, t0
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a2, a2, t1
-; RV32-NEXT: sub a2, a2, t2
-; RV32-NEXT: and a3, t1, a3
-; RV32-NEXT: sub a2, a2, a3
-; RV32-NEXT: and a1, t3, a1
+; RV32-NEXT: add a2, a2, t2
+; RV32-NEXT: mul a3, a3, t1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: mul a1, t3, a1
; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: sub a0, a0, t4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, a0, t4
; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: sltu a1, t6, t5
+; RV32-NEXT: sltu a1, t5, t4
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: add a0, a0, s0
; RV32-NEXT: srai a1, a4, 31
; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: xor a1, s0, a1
+; RV32-NEXT: xor a1, t6, a1
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: beqz a0, .LBB61_2
; RV32-NEXT: # %bb.1: # %overflow
@@ -3532,7 +3505,6 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB61_3: # %overflow
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -3554,9 +3526,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: addi sp, sp, -16
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
-; RV32ZBA-NEXT: .cfi_offset s1, -8
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
; RV32ZBA-NEXT: add a4, a5, a4
@@ -3572,34 +3542,33 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mul a7, a1, a3
; RV32ZBA-NEXT: add t0, a7, a6
; RV32ZBA-NEXT: srai t1, a1, 31
-; RV32ZBA-NEXT: and t2, t1, a2
+; RV32ZBA-NEXT: mul t2, a2, t1
; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: and t4, t3, a0
-; RV32ZBA-NEXT: neg t5, t4
-; RV32ZBA-NEXT: sub t6, t5, t2
-; RV32ZBA-NEXT: add s0, t0, t6
-; RV32ZBA-NEXT: sltu s1, s0, t0
+; RV32ZBA-NEXT: mul t4, t3, a0
+; RV32ZBA-NEXT: add t5, t4, t2
+; RV32ZBA-NEXT: add t6, t0, t5
+; RV32ZBA-NEXT: sltu s0, t6, t0
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a2, a2, t1
-; RV32ZBA-NEXT: sub a2, a2, t2
-; RV32ZBA-NEXT: and a3, t1, a3
-; RV32ZBA-NEXT: sub a2, a2, a3
-; RV32ZBA-NEXT: and a1, t3, a1
+; RV32ZBA-NEXT: add a2, a2, t2
+; RV32ZBA-NEXT: mul a3, a3, t1
+; RV32ZBA-NEXT: add a2, a2, a3
+; RV32ZBA-NEXT: mul a1, t3, a1
; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: sub a0, a0, a1
-; RV32ZBA-NEXT: sub a0, a0, t4
+; RV32ZBA-NEXT: add a0, a0, a1
+; RV32ZBA-NEXT: add a0, a0, t4
; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: sltu a1, t6, t5
+; RV32ZBA-NEXT: sltu a1, t5, t4
; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: add a0, a5, a0
-; RV32ZBA-NEXT: add a0, a0, s1
+; RV32ZBA-NEXT: add a0, a0, s0
; RV32ZBA-NEXT: srai a1, a4, 31
; RV32ZBA-NEXT: xor a0, a0, a1
-; RV32ZBA-NEXT: xor a1, s0, a1
+; RV32ZBA-NEXT: xor a1, t6, a1
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: beqz a0, .LBB61_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
@@ -3609,7 +3578,6 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: li a0, 1
; RV32ZBA-NEXT: .LBB61_3: # %overflow
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -3657,8 +3625,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
; RV32-NEXT: add a6, a4, a6
; RV32-NEXT: sub t1, a6, a1
; RV32-NEXT: srai t2, a1, 31
-; RV32-NEXT: andi t3, t2, -13
-; RV32-NEXT: sub t3, a5, t3
+; RV32-NEXT: mul t3, t2, a2
+; RV32-NEXT: sub t3, t3, a0
; RV32-NEXT: add t4, t1, t3
; RV32-NEXT: sltu t5, t4, t1
; RV32-NEXT: neg t6, a1
@@ -3719,8 +3687,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
; RV32ZBA-NEXT: add a6, a4, a6
; RV32ZBA-NEXT: sub t1, a6, a1
; RV32ZBA-NEXT: srai t2, a1, 31
-; RV32ZBA-NEXT: andi t3, t2, -13
-; RV32ZBA-NEXT: sub t3, a5, t3
+; RV32ZBA-NEXT: mul t3, t2, a2
+; RV32ZBA-NEXT: sub t3, t3, a0
; RV32ZBA-NEXT: add t4, t1, t3
; RV32ZBA-NEXT: sltu t5, t4, t1
; RV32ZBA-NEXT: neg t6, a1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 9cb0ec4d98fb5..217caeebe6335 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -38,23 +38,22 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_0246_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull lr, r12, r1, r0
-; CHECK-NEXT: umull r2, r4, r3, r0
+; CHECK-NEXT: umull r2, r5, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: and.w r2, r1, r0, asr #31
-; CHECK-NEXT: sub.w r2, r12, r2
-; CHECK-NEXT: and.w r1, r0, r1, asr #31
-; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: and.w r2, r3, r0, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: and.w r0, r0, r3, asr #31
-; CHECK-NEXT: subs r0, r2, r0
+; CHECK-NEXT: asrs r2, r0, #31
+; CHECK-NEXT: mla r4, r1, r2, r12
+; CHECK-NEXT: asrs r1, r1, #31
+; CHECK-NEXT: mla r2, r3, r2, r5
+; CHECK-NEXT: asrs r3, r3, #31
+; CHECK-NEXT: mla r1, r1, r0, r4
+; CHECK-NEXT: mla r0, r3, r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -68,23 +67,22 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_ext0_0246:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: asrs r4, r0, #31
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull lr, r12, r0, r1
-; CHECK-NEXT: umull r2, r4, r0, r3
+; CHECK-NEXT: umull r2, r5, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: and.w r2, r0, r1, asr #31
-; CHECK-NEXT: sub.w r2, r12, r2
-; CHECK-NEXT: and.w r1, r1, r0, asr #31
-; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: and.w r2, r0, r3, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: and.w r0, r3, r0, asr #31
-; CHECK-NEXT: subs r0, r2, r0
+; CHECK-NEXT: asrs r2, r1, #31
+; CHECK-NEXT: mla r2, r0, r2, r12
+; CHECK-NEXT: mla r1, r4, r1, r2
+; CHECK-NEXT: asrs r2, r3, #31
+; CHECK-NEXT: mla r0, r0, r2, r5
+; CHECK-NEXT: mla r0, r4, r3, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -132,24 +130,23 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_1357_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vrev64.32 q1, q0
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull lr, r12, r1, r0
-; CHECK-NEXT: umull r2, r4, r3, r0
+; CHECK-NEXT: umull r2, r5, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: and.w r2, r1, r0, asr #31
-; CHECK-NEXT: sub.w r2, r12, r2
-; CHECK-NEXT: and.w r1, r0, r1, asr #31
-; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: and.w r2, r3, r0, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: and.w r0, r0, r3, asr #31
-; CHECK-NEXT: subs r0, r2, r0
+; CHECK-NEXT: asrs r2, r0, #31
+; CHECK-NEXT: mla r4, r1, r2, r12
+; CHECK-NEXT: asrs r1, r1, #31
+; CHECK-NEXT: mla r2, r3, r2, r5
+; CHECK-NEXT: asrs r3, r3, #31
+; CHECK-NEXT: mla r1, r1, r0, r4
+; CHECK-NEXT: mla r0, r3, r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -163,24 +160,23 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_ext0_1357:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vrev64.32 q1, q0
+; CHECK-NEXT: asrs r4, r0, #31
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull lr, r12, r0, r1
-; CHECK-NEXT: umull r2, r4, r0, r3
+; CHECK-NEXT: umull r2, r5, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: and.w r2, r0, r1, asr #31
-; CHECK-NEXT: sub.w r2, r12, r2
-; CHECK-NEXT: and.w r1, r1, r0, asr #31
-; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: and.w r2, r0, r3, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: and.w r0, r3, r0, asr #31
-; CHECK-NEXT: subs r0, r2, r0
+; CHECK-NEXT: asrs r2, r1, #31
+; CHECK-NEXT: mla r2, r0, r2, r12
+; CHECK-NEXT: mla r1, r4, r1, r2
+; CHECK-NEXT: asrs r2, r3, #31
+; CHECK-NEXT: mla r0, r0, r2, r5
+; CHECK-NEXT: mla r0, r4, r3, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -234,39 +230,36 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_0213_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vmov.f32 s6, s3
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: umull r2, r5, r3, r0
; CHECK-NEXT: umull lr, r12, r1, r0
-; CHECK-NEXT: umull r2, r4, r3, r0
; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT: and.w r2, r1, r0, asr #31
-; CHECK-NEXT: sub.w r2, r12, r2
-; CHECK-NEXT: and.w r1, r0, r1, asr #31
-; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: and.w r2, r3, r0, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: and.w r3, r0, r3, asr #31
-; CHECK-NEXT: subs r2, r2, r3
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT: asrs r2, r0, #31
+; CHECK-NEXT: mla r4, r1, r2, r12
+; CHECK-NEXT: asrs r1, r1, #31
+; CHECK-NEXT: mla r5, r3, r2, r5
+; CHECK-NEXT: asrs r3, r3, #31
+; CHECK-NEXT: mla r1, r1, r0, r4
+; CHECK-NEXT: mla r3, r3, r0, r5
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r1
; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: and.w r2, r1, r0, asr #31
-; CHECK-NEXT: umull r3, r4, r1, r0
-; CHECK-NEXT: and.w r1, r0, r1, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: sub.w r12, r2, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: umull r4, r1, r2, r0
+; CHECK-NEXT: umull r3, r5, r1, r0
+; CHECK-NEXT: mla r5, r1, r2, r5
+; CHECK-NEXT: asrs r1, r1, #31
+; CHECK-NEXT: mla r12, r1, r0, r5
+; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: umull r4, r1, r5, r0
+; CHECK-NEXT: mla r1, r5, r2, r1
+; CHECK-NEXT: asrs r2, r5, #31
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT: and.w r3, r2, r0, asr #31
-; CHECK-NEXT: and.w r0, r0, r2, asr #31
-; CHECK-NEXT: subs r1, r1, r3
-; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: mla r0, r2, r0, r1
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = sext <4 x i32> %shuf1 to <4 x i64>
@@ -280,39 +273,36 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_ext0_0213:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vmov.f32 s6, s3
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: asrs r4, r0, #31
+; CHECK-NEXT: vmov.f32 s6, s3
; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: umull r2, r5, r0, r3
; CHECK-NEXT: umull lr, r12, r0, r1
-; CHECK-NEXT: umull r2, r4, r0, r3
; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT: and.w r2, r0, r1, asr #31
-; CHECK-NEXT: sub.w r2, r12, r2
-; CHECK-NEXT: and.w r1, r1, r0, asr #31
-; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: and.w r2, r0, r3, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: and.w r3, r3, r0, asr #31
-; CHECK-NEXT: subs r2, r2, r3
+; CHECK-NEXT: asrs r2, r1, #31
+; CHECK-NEXT: mla r2, r0, r2, r12
+; CHECK-NEXT: mla r1, r4, r1, r2
+; CHECK-NEXT: asrs r2, r3, #31
+; CHECK-NEXT: mla r2, r0, r2, r5
+; CHECK-NEXT: mla r2, r4, r3, r2
; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: umull r3, r4, r0, r1
-; CHECK-NEXT: and.w r2, r0, r1, asr #31
-; CHECK-NEXT: and.w r1, r1, r0, asr #31
-; CHECK-NEXT: subs r2, r4, r2
-; CHECK-NEXT: sub.w r12, r2, r1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: umull r4, r1, r0, r2
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT: and.w r3, r0, r2, asr #31
-; CHECK-NEXT: and.w r0, r2, r0, asr #31
-; CHECK-NEXT: subs r1, r1, r3
-; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: umull r2, r3, r0, r1
+; CHECK-NEXT: asrs r5, r1, #31
+; CHECK-NEXT: mla r3, r0, r5, r3
+; CHECK-NEXT: mla r12, r4, r1, r3
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: umull r5, r1, r0, r3
+; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT: asrs r2, r3, #31
+; CHECK-NEXT: mla r0, r0, r2, r1
+; CHECK-NEXT: mla r0, r4, r3, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = sext <4 x i32> %shuf1 to <4 x i64>
diff --git a/llvm/test/CodeGen/X86/extmul128.ll b/llvm/test/CodeGen/X86/extmul128.ll
index a2d8211888618..a7f2959a23c2c 100644
--- a/llvm/test/CodeGen/X86/extmul128.ll
+++ b/llvm/test/CodeGen/X86/extmul128.ll
@@ -29,37 +29,6 @@ define i128 @i64_zext_sext_i128(i64 %a, i64 %b) {
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %rsi
; CHECK-NEXT: sarq $63, %rsi
-; CHECK-NEXT: andq %rdi, %rsi
-; CHECK-NEXT: subq %rsi, %rdx
-; CHECK-NEXT: retq
- %aa = zext i64 %a to i128
- %bb = sext i64 %b to i128
- %cc = mul i128 %aa, %bb
- ret i128 %cc
-}
-
-define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {
-; CHECK-LABEL: i64_sext_zext_i128:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq %rdi, %rcx
-; CHECK-NEXT: sarq $63, %rcx
-; CHECK-NEXT: mulq %rsi
-; CHECK-NEXT: andq %rsi, %rcx
-; CHECK-NEXT: subq %rcx, %rdx
-; CHECK-NEXT: retq
- %aa = sext i64 %a to i128
- %bb = zext i64 %b to i128
- %cc = mul i128 %aa, %bb
- ret i128 %cc
-}
-
-define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize {
-; CHECK-LABEL: i64_zext_sext_i128_minsize:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %rsi
-; CHECK-NEXT: sarq $63, %rsi
; CHECK-NEXT: imulq %rdi, %rsi
; CHECK-NEXT: addq %rsi, %rdx
; CHECK-NEXT: retq
@@ -69,8 +38,8 @@ define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize {
ret i128 %cc
}
-define i128 @i64_sext_zext_i128_minsize(i64 %a, i64 %b) minsize {
-; CHECK-LABEL: i64_sext_zext_i128_minsize:
+define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {
+; CHECK-LABEL: i64_sext_zext_i128:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq %rdi, %rcx
diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 3733306f354a5..9a6cf0b065662 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -7,39 +7,34 @@
define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp {
; CHECK-LABEL: x:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: pushq %r15
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset %rbx, -32
-; CHECK-NEXT: .cfi_offset %r14, -24
-; CHECK-NEXT: .cfi_offset %r15, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %r14, -16
; CHECK-NEXT: movq %rdx, %r11
; CHECK-NEXT: movq %rdi, %r9
-; CHECK-NEXT: movq %rsi, %rdi
-; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: movq %rdi, %r10
-; CHECK-NEXT: andq %rdx, %r10
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: sarq $63, %rbx
+; CHECK-NEXT: movq %rdx, %rdi
+; CHECK-NEXT: imulq %rbx, %rdi
; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: mulq %rdi
+; CHECK-NEXT: mulq %rbx
; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: subq %r10, %rbx
-; CHECK-NEXT: andq %rcx, %rdi
-; CHECK-NEXT: subq %rdi, %rbx
-; CHECK-NEXT: movq %rcx, %r14
-; CHECK-NEXT: sarq $63, %r14
-; CHECK-NEXT: movq %r14, %r15
-; CHECK-NEXT: andq %rsi, %r15
-; CHECK-NEXT: movq %r14, %rax
+; CHECK-NEXT: addq %rdi, %rdx
+; CHECK-NEXT: imulq %rcx, %rbx
+; CHECK-NEXT: addq %rdx, %rbx
+; CHECK-NEXT: movq %rcx, %rdi
+; CHECK-NEXT: sarq $63, %rdi
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: imulq %rsi, %r14
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %r9
; CHECK-NEXT: movq %rax, %r10
-; CHECK-NEXT: movq %rdx, %rdi
-; CHECK-NEXT: subq %r15, %rdi
-; CHECK-NEXT: andq %r9, %r14
-; CHECK-NEXT: subq %r14, %rdi
+; CHECK-NEXT: addq %r14, %rdx
+; CHECK-NEXT: imulq %r9, %rdi
+; CHECK-NEXT: addq %rdx, %rdi
; CHECK-NEXT: addq %r8, %r10
; CHECK-NEXT: adcq %rbx, %rdi
; CHECK-NEXT: movq %r9, %rax
@@ -77,7 +72,6 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
; CHECK-NEXT: movq %r9, %rdx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
; CHECK-NEXT: retq
; CHECK-NEXT: LBB0_1: ## %overflow
; CHECK-NEXT: ud2
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 07debb11b92f7..996601ed3be64 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -369,8 +369,8 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: .cfi_def_cfa_offset 32
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 28
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
@@ -378,54 +378,52 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: andl %eax, %ebx
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: subl %ebx, %esi
-; X86-NEXT: andl %ebp, %edi
-; X86-NEXT: subl %edi, %esi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: imull %ebx, %edi
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: addl %edi, %edx
; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: imull %ebp, %ebx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: andl %ecx, %ebp
+; X86-NEXT: imull %ecx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: subl %ebp, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %edx, %edi
-; X86-NEXT: subl %edi, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: imull %esi, %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: setb %bl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT: movzbl %bl, %esi
; X86-NEXT: adcl %esi, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: adcl %edi, %edx
; X86-NEXT: movl %ebp, %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: xorl %edi, %edx
@@ -436,11 +434,11 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF
; X86-NEXT: orl %edx, %edi
; X86-NEXT: notl %ecx
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload
; X86-NEXT: cmovel %ebp, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %esi, %edx
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 6631c6c4cc014..367ca660cda14 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -9,44 +9,39 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
; X64-NEXT: .cfi_def_cfa_offset 24
-; X64-NEXT: pushq %r12
-; X64-NEXT: .cfi_def_cfa_offset 32
; X64-NEXT: pushq %rbx
-; X64-NEXT: .cfi_def_cfa_offset 40
-; X64-NEXT: .cfi_offset %rbx, -40
-; X64-NEXT: .cfi_offset %r12, -32
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rdi, %r10
-; X64-NEXT: movq %rsi, %r9
-; X64-NEXT: sarq $63, %r9
-; X64-NEXT: movq %r9, %r11
-; X64-NEXT: andq %rdx, %r11
+; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: imulq %r14, %rdi
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: subq %r11, %r14
-; X64-NEXT: andq %rcx, %r9
-; X64-NEXT: subq %r9, %r14
-; X64-NEXT: movq %rcx, %r15
-; X64-NEXT: sarq $63, %r15
-; X64-NEXT: movq %r15, %r12
-; X64-NEXT: andq %rsi, %r12
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: imulq %rcx, %r14
+; X64-NEXT: addq %rdx, %r14
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: imulq %rsi, %r15
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: subq %r12, %r9
-; X64-NEXT: andq %r10, %r15
-; X64-NEXT: subq %r15, %r9
-; X64-NEXT: addq %rdi, %r11
-; X64-NEXT: adcq %r14, %r9
+; X64-NEXT: addq %r15, %rdx
+; X64-NEXT: imulq %r10, %rdi
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: addq %r9, %r11
+; X64-NEXT: adcq %r14, %rdi
; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rax, %r9
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbx
@@ -66,16 +61,15 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X64-NEXT: addq %r14, %rax
; X64-NEXT: adcq %rbx, %rdx
; X64-NEXT: addq %r11, %rax
-; X64-NEXT: adcq %r9, %rdx
+; X64-NEXT: adcq %rdi, %rdx
; X64-NEXT: movq %r10, 8(%r8)
; X64-NEXT: sarq $63, %r10
; X64-NEXT: xorq %r10, %rdx
; X64-NEXT: xorq %rax, %r10
; X64-NEXT: orq %rdx, %r10
; X64-NEXT: setne %al
-; X64-NEXT: movq %rdi, (%r8)
+; X64-NEXT: movq %r9, (%r8)
; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r12
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
; X64-NEXT: retq
@@ -90,8 +84,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $60, %esp
-; X86-NEXT: .cfi_def_cfa_offset 80
+; X86-NEXT: subl $56, %esp
+; X86-NEXT: .cfi_def_cfa_offset 76
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
@@ -105,229 +99,226 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ebp
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: mull %ecx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: adcl %eax, %ebp
; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl %edi, %esi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: imull %esi, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: subl %esi, %edi
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: subl %ebx, %edi
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: imull %esi, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: subl %esi, %ebp
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: subl %eax, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl %esi, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: setb %bl
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl (%esp), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: mull %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: subl %ecx, %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %eax, %ebx
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: imull %ebx, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: imull %ebx, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl %eax, %esi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull %ebx, %ecx
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: subl %esi, %ecx
+; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %ebx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: imull %ebx, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl %ecx, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: xorl %ecx, %ebp
-; X86-NEXT: xorl %esi, %ecx
-; X86-NEXT: orl %ebp, %ecx
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -335,7 +326,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $60, %esp
+; X86-NEXT: addl $56, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -369,239 +360,234 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X64-NEXT: .cfi_offset %r14, -32
; X64-NEXT: .cfi_offset %r15, -24
; X64-NEXT: .cfi_offset %rbp, -16
-; X64-NEXT: movq %rcx, %r14
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rsi, %r10
-; X64-NEXT: movq %rdi, %r11
+; X64-NEXT: movq %rcx, %r11
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rsi, %r15
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rsi, %r10
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %rbx, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rbx
-; X64-NEXT: adcq %rsi, %r12
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r10, %r14
+; X64-NEXT: adcq %rcx, %r12
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %edi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %r9, %rcx
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r12, %rsi
-; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movzbl %al, %ecx
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r8, %rdi
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r12, %rbx
+; X64-NEXT: adcq %rcx, %r11
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %r8, %rcx
; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %r8, %r13
; X64-NEXT: adcq $0, %r12
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r12, %rdi
-; X64-NEXT: setb %r9b
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: adcq %r12, %r10
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: addq %r10, %r8
+; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
-; X64-NEXT: adcq %rbx, %rbp
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %r15, %r12
+; X64-NEXT: adcq %r14, %rbp
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: adcq $0, %r11
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, %rcx
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rdi, %r10
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %r10, %r9
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r10, %r15
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movq %rax, %rdi
; X64-NEXT: adcq %r13, %r11
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %r11, %r13
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: addq %r8, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %rbp, %r15
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r10
+; X64-NEXT: addq %r8, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %rbp, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: addq %rsi, %r13
-; X64-NEXT: adcq %r12, %rdi
-; X64-NEXT: setb %r11b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r15
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: addq %rbx, %r13
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
+; X64-NEXT: setb %cl
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r11
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %r9
; X64-NEXT: addq %r8, %rax
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: adcq %rsi, %r10
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: adcq %rdi, %r9
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r10, %rbx
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r9, %r14
+; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: addq %r13, %r15
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %rdi, %r8
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movzbl %r11b, %eax
-; X64-NEXT: adcq %rax, %rbx
+; X64-NEXT: addq %r13, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r10, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: adcq %rax, %r14
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: movq %rbx, %r10
; X64-NEXT: sarq $63, %r13
; X64-NEXT: movq %r13, %rcx
-; X64-NEXT: andq %r9, %rcx
+; X64-NEXT: imulq %r12, %rcx
; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r14
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: subq %rcx, %r10
-; X64-NEXT: andq %r13, %r14
-; X64-NEXT: subq %r14, %r10
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
-; X64-NEXT: andq %r14, %rsi
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %r13, %r15
+; X64-NEXT: addq %rdx, %r15
+; X64-NEXT: movq %r13, %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: subq %rsi, %rcx
-; X64-NEXT: andq %r13, %rdi
-; X64-NEXT: subq %rdi, %rcx
-; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: imulq %r13, %rsi
+; X64-NEXT: addq %rcx, %rsi
+; X64-NEXT: movq %rax, %rcx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: addq %rax, %r8
-; X64-NEXT: adcq %r10, %rcx
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: adcq %r15, %rsi
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r13
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r11, %r15
+; X64-NEXT: addq %r9, %r15
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: addq %rsi, %r15
-; X64-NEXT: adcq %r11, %r13
-; X64-NEXT: setb %sil
+; X64-NEXT: addq %rcx, %r15
+; X64-NEXT: adcq %r9, %r13
+; X64-NEXT: setb %cl
; X64-NEXT: addq %rax, %r13
-; X64-NEXT: movzbl %sil, %esi
-; X64-NEXT: adcq %rdx, %rsi
+; X64-NEXT: movzbl %cl, %r9d
+; X64-NEXT: adcq %rdx, %r9
; X64-NEXT: addq %r8, %r13
-; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: sarq $63, %r9
-; X64-NEXT: movq %r9, %r8
+; X64-NEXT: adcq %rsi, %r9
+; X64-NEXT: sarq $63, %r12
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: andq %rax, %r8
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: imulq %r12, %r8
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rax, %rsi
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: subq %r8, %r14
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: addq %rdx, %r8
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT: andq %rdi, %rax
-; X64-NEXT: subq %rax, %r14
-; X64-NEXT: movq %r9, %r12
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: andq %rax, %r12
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: subq %r12, %r8
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: imulq %r12, %rbx
+; X64-NEXT: addq %r8, %rbx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: andq %r9, %rax
-; X64-NEXT: subq %rax, %r8
-; X64-NEXT: addq %rcx, %r10
-; X64-NEXT: adcq %r14, %r8
-; X64-NEXT: movq %rcx, %r14
-; X64-NEXT: addq %r11, %r14
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %r12, %r10
+; X64-NEXT: addq %rdx, %r10
+; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: adcq %rbx, %r10
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: addq %r11, %rbx
; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: addq %rax, %r14
+; X64-NEXT: addq %rax, %rbx
; X64-NEXT: adcq %rdx, %r11
-; X64-NEXT: setb %r9b
+; X64-NEXT: setb %cl
; X64-NEXT: addq %rax, %r11
-; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movzbl %cl, %eax
; X64-NEXT: adcq %rdx, %rax
-; X64-NEXT: addq %r10, %r11
-; X64-NEXT: adcq %r8, %rax
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %r14
+; X64-NEXT: addq %r8, %r11
+; X64-NEXT: adcq %r10, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT: adcq %r15, %rbx
; X64-NEXT: adcq %r13, %r11
-; X64-NEXT: adcq %rsi, %rax
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
-; X64-NEXT: adcq %rbx, %r11
+; X64-NEXT: adcq %r9, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
+; X64-NEXT: adcq %r14, %r11
; X64-NEXT: adcq %rbp, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; X64-NEXT: movq %rsi, %rdx
-; X64-NEXT: sarq $63, %rdx
-; X64-NEXT: xorq %rdx, %rax
-; X64-NEXT: xorq %rdx, %r14
-; X64-NEXT: orq %rax, %r14
-; X64-NEXT: xorq %rdx, %r11
-; X64-NEXT: xorq %rcx, %rdx
-; X64-NEXT: orq %r11, %rdx
-; X64-NEXT: orq %r14, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: sarq $63, %rcx
+; X64-NEXT: xorq %rcx, %rax
+; X64-NEXT: xorq %rcx, %rbx
+; X64-NEXT: orq %rax, %rbx
+; X64-NEXT: xorq %rcx, %r11
+; X64-NEXT: xorq %rsi, %rcx
+; X64-NEXT: orq %r11, %rcx
+; X64-NEXT: orq %rbx, %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq %rsi, 24(%rax)
+; X64-NEXT: movq %rdx, 24(%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
; X64-NEXT: movq %rcx, (%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -627,399 +613,400 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $152, %esp
-; X86-NEXT: .cfi_def_cfa_offset 172
+; X86-NEXT: subl $156, %esp
+; X86-NEXT: .cfi_def_cfa_offset 176
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb %cl
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %ebx ## 4-byte Folded Reload
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebp, %ecx
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movzbl %bl, %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl (%esp), %edx ## 4-byte Reload
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebp, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %ebx, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ebp, %ebx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl %bl, %ecx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: adcl %ecx, %ebp
+; X86-NEXT: adcl %ebp, %edi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: adcl $0, %edx
@@ -1032,9 +1019,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: adcl $0, %eax
; X86-NEXT: adcl $0, %esi
@@ -1047,13 +1034,41 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
@@ -1062,117 +1077,89 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: adcl %ebx, %esi
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: adcl %eax, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
@@ -1188,25 +1175,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: addl %eax, %ebx
; X86-NEXT: movzbl %cl, %eax
@@ -1214,75 +1201,76 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %ebp, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: setb %al
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %edx, %ecx
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %eax, %esi
-; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: imull %edi, %esi
+; X86-NEXT: addl %ecx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull %edi, %ecx
; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: subl %ebx, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, %ebx
; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %ebp, %esi
@@ -1292,266 +1280,263 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %edx, %ebp
-; X86-NEXT: setb %bl
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movzbl %bl, %ebx
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %edi, %edx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: imull %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: subl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %edi, %edx
-; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: addl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: andl %edi, %edx
+; X86-NEXT: imull %edi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: movl (%esp), %edx ## 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: subl %edx, %esi
-; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: subl %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: addl %ebx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: adcl %eax, %edx
; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %esi ## 4-byte Reload
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: adcl %edx, %esi
; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movzbl %bl, %ebp
-; X86-NEXT: adcl %edx, %ebp
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: movzbl %bl, %ebx
+; X86-NEXT: adcl %edx, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %ecx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: setb %cl
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: adcl %ebp, %eax
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl %ebx, %eax
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: setb %al
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: imull %ebp, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: imull %ebp, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: imull %ebp, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %ecx, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: imull %ebp, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %dl
; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: movzbl %dl, %ecx
-; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: movzbl %dl, %eax
+; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: andl %ebp, %esi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: imull %ebp, %ecx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: subl %esi, %ebx
-; X86-NEXT: andl %ebp, %ecx
-; X86-NEXT: subl %ecx, %ebx
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: imull %ebp, %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: subl %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: subl %eax, %ebp
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %edx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: imull %ebp, %eax
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %edx, %eax
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: xorl %ecx, %edi
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: xorl %edi, %esi
; X86-NEXT: orl %edx, %esi
+; X86-NEXT: xorl %edi, %ecx
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: xorl %edi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: xorl %edi, %ebp
+; X86-NEXT: orl %eax, %ebp
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: orl %ebp, %edi
+; X86-NEXT: orl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 28(%eax)
+; X86-NEXT: movl %ebx, 28(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1567,7 +1552,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 24(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $152, %esp
+; X86-NEXT: addl $156, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 641663d9eedfe..dbec86755a969 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3297,33 +3297,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %r8, %r14
-; SSE2-NEXT: movq %rcx, %rbp
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rsi, %r11
; SSE2-NEXT: movq %rdi, %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movq %r11, %rbx
-; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: movq %rbx, %r15
-; SSE2-NEXT: andq %r14, %r15
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSE2-NEXT: movq %r11, %r12
+; SSE2-NEXT: sarq $63, %r12
+; SSE2-NEXT: movq %r14, %rbx
+; SSE2-NEXT: imulq %r12, %rbx
; SSE2-NEXT: movq %r14, %rax
-; SSE2-NEXT: mulq %rbx
+; SSE2-NEXT: mulq %r12
; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: movq %rdx, %r12
-; SSE2-NEXT: subq %r15, %r12
-; SSE2-NEXT: andq %r9, %rbx
-; SSE2-NEXT: subq %rbx, %r12
-; SSE2-NEXT: movq %r9, %r13
-; SSE2-NEXT: sarq $63, %r13
-; SSE2-NEXT: movq %r13, %rcx
-; SSE2-NEXT: andq %r11, %rcx
-; SSE2-NEXT: movq %r13, %rax
+; SSE2-NEXT: addq %rbx, %rdx
+; SSE2-NEXT: imulq %r9, %r12
+; SSE2-NEXT: addq %rdx, %r12
+; SSE2-NEXT: movq %r9, %rbx
+; SSE2-NEXT: sarq $63, %rbx
+; SSE2-NEXT: movq %rbx, %r13
+; SSE2-NEXT: imulq %r11, %r13
+; SSE2-NEXT: movq %rbx, %rax
; SSE2-NEXT: mulq %r10
; SSE2-NEXT: movq %rax, %r15
-; SSE2-NEXT: movq %rdx, %rbx
-; SSE2-NEXT: subq %rcx, %rbx
-; SSE2-NEXT: andq %r10, %r13
-; SSE2-NEXT: subq %r13, %rbx
+; SSE2-NEXT: addq %r13, %rdx
+; SSE2-NEXT: imulq %r10, %rbx
+; SSE2-NEXT: addq %rdx, %rbx
; SSE2-NEXT: addq %rdi, %r15
; SSE2-NEXT: adcq %r12, %rbx
; SSE2-NEXT: movq %r10, %rax
@@ -3343,11 +3341,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: addq %r13, %r10
; SSE2-NEXT: adcq %r14, %r12
; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %ecx
+; SSE2-NEXT: movzbl %al, %r14d
; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r9
; SSE2-NEXT: addq %r12, %rax
-; SSE2-NEXT: adcq %rcx, %rdx
+; SSE2-NEXT: adcq %r14, %rdx
; SSE2-NEXT: addq %r15, %rax
; SSE2-NEXT: adcq %rbx, %rdx
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3358,56 +3356,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: xorl %r15d, %r15d
; SSE2-NEXT: orq %rdx, %r10
; SSE2-NEXT: setne %r15b
-; SSE2-NEXT: movq %rbp, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %rcx, %r11
-; SSE2-NEXT: andq %rsi, %r11
+; SSE2-NEXT: movq %rcx, %rbx
+; SSE2-NEXT: sarq $63, %rbx
+; SSE2-NEXT: movq %rsi, %r10
+; SSE2-NEXT: imulq %rbx, %r10
; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: mulq %rcx
+; SSE2-NEXT: mulq %rbx
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: movq %rdx, %r10
-; SSE2-NEXT: subq %r11, %r10
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andq %rax, %rcx
-; SSE2-NEXT: subq %rcx, %r10
-; SSE2-NEXT: movq %rax, %r11
-; SSE2-NEXT: movq %rax, %r13
-; SSE2-NEXT: sarq $63, %r11
-; SSE2-NEXT: movq %r11, %rcx
-; SSE2-NEXT: andq %rbp, %rcx
-; SSE2-NEXT: movq %r11, %rax
+; SSE2-NEXT: addq %r10, %rdx
+; SSE2-NEXT: imulq %rbp, %rbx
+; SSE2-NEXT: addq %rdx, %rbx
+; SSE2-NEXT: movq %rbp, %r10
+; SSE2-NEXT: sarq $63, %r10
+; SSE2-NEXT: movq %r10, %r14
+; SSE2-NEXT: imulq %rcx, %r14
+; SSE2-NEXT: movq %r10, %rax
; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: movq %rdx, %r14
-; SSE2-NEXT: subq %rcx, %r14
-; SSE2-NEXT: andq %r8, %r11
-; SSE2-NEXT: subq %r11, %r14
-; SSE2-NEXT: addq %r9, %rbx
-; SSE2-NEXT: adcq %r10, %r14
+; SSE2-NEXT: movq %rax, %r11
+; SSE2-NEXT: addq %r14, %rdx
+; SSE2-NEXT: imulq %r8, %r10
+; SSE2-NEXT: addq %rdx, %r10
+; SSE2-NEXT: addq %r9, %r11
+; SSE2-NEXT: adcq %rbx, %r10
; SSE2-NEXT: movq %r8, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %r9
-; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: movq %rbp, %rax
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: movq %rax, %r11
-; SSE2-NEXT: addq %r9, %r11
+; SSE2-NEXT: movq %rax, %r14
+; SSE2-NEXT: addq %r9, %r14
; SSE2-NEXT: adcq $0, %rsi
; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: mulq %r13
+; SSE2-NEXT: mulq %rbp
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: addq %r11, %r9
+; SSE2-NEXT: addq %r14, %r9
; SSE2-NEXT: adcq %rsi, %r8
; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %ecx
-; SSE2-NEXT: movq %rbp, %rax
-; SSE2-NEXT: mulq %r13
+; SSE2-NEXT: movzbl %al, %esi
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rbp
; SSE2-NEXT: addq %r8, %rax
-; SSE2-NEXT: adcq %rcx, %rdx
-; SSE2-NEXT: addq %rbx, %rax
-; SSE2-NEXT: adcq %r14, %rdx
+; SSE2-NEXT: adcq %rsi, %rdx
+; SSE2-NEXT: addq %r11, %rax
+; SSE2-NEXT: adcq %r10, %rdx
; SSE2-NEXT: movq %r9, 24(%r12)
; SSE2-NEXT: sarq $63, %r9
; SSE2-NEXT: xorq %r9, %rdx
@@ -3420,7 +3414,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: negl %r15d
; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %r10, 16(%r12)
+; SSE2-NEXT: movq %rbx, 16(%r12)
; SSE2-NEXT: movq %rdi, (%r12)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
@@ -3439,33 +3433,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: movq %r8, %r14
-; SSSE3-NEXT: movq %rcx, %rbp
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rsi, %r11
; SSSE3-NEXT: movq %rdi, %r10
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSSE3-NEXT: movq %r11, %rbx
-; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: movq %rbx, %r15
-; SSSE3-NEXT: andq %r14, %r15
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSSE3-NEXT: movq %r11, %r12
+; SSSE3-NEXT: sarq $63, %r12
+; SSSE3-NEXT: movq %r14, %rbx
+; SSSE3-NEXT: imulq %r12, %rbx
; SSSE3-NEXT: movq %r14, %rax
-; SSSE3-NEXT: mulq %rbx
+; SSSE3-NEXT: mulq %r12
; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: movq %rdx, %r12
-; SSSE3-NEXT: subq %r15, %r12
-; SSSE3-NEXT: andq %r9, %rbx
-; SSSE3-NEXT: subq %rbx, %r12
-; SSSE3-NEXT: movq %r9, %r13
-; SSSE3-NEXT: sarq $63, %r13
-; SSSE3-NEXT: movq %r13, %rcx
-; SSSE3-NEXT: andq %r11, %rcx
-; SSSE3-NEXT: movq %r13, %rax
+; SSSE3-NEXT: addq %rbx, %rdx
+; SSSE3-NEXT: imulq %r9, %r12
+; SSSE3-NEXT: addq %rdx, %r12
+; SSSE3-NEXT: movq %r9, %rbx
+; SSSE3-NEXT: sarq $63, %rbx
+; SSSE3-NEXT: movq %rbx, %r13
+; SSSE3-NEXT: imulq %r11, %r13
+; SSSE3-NEXT: movq %rbx, %rax
; SSSE3-NEXT: mulq %r10
; SSSE3-NEXT: movq %rax, %r15
-; SSSE3-NEXT: movq %rdx, %rbx
-; SSSE3-NEXT: subq %rcx, %rbx
-; SSSE3-NEXT: andq %r10, %r13
-; SSSE3-NEXT: subq %r13, %rbx
+; SSSE3-NEXT: addq %r13, %rdx
+; SSSE3-NEXT: imulq %r10, %rbx
+; SSSE3-NEXT: addq %rdx, %rbx
; SSSE3-NEXT: addq %rdi, %r15
; SSSE3-NEXT: adcq %r12, %rbx
; SSSE3-NEXT: movq %r10, %rax
@@ -3485,11 +3477,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: addq %r13, %r10
; SSSE3-NEXT: adcq %r14, %r12
; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %ecx
+; SSSE3-NEXT: movzbl %al, %r14d
; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r9
; SSSE3-NEXT: addq %r12, %rax
-; SSSE3-NEXT: adcq %rcx, %rdx
+; SSSE3-NEXT: adcq %r14, %rdx
; SSSE3-NEXT: addq %r15, %rax
; SSSE3-NEXT: adcq %rbx, %rdx
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3500,56 +3492,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: xorl %r15d, %r15d
; SSSE3-NEXT: orq %rdx, %r10
; SSSE3-NEXT: setne %r15b
-; SSSE3-NEXT: movq %rbp, %rcx
-; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movq %rcx, %r11
-; SSSE3-NEXT: andq %rsi, %r11
+; SSSE3-NEXT: movq %rcx, %rbx
+; SSSE3-NEXT: sarq $63, %rbx
+; SSSE3-NEXT: movq %rsi, %r10
+; SSSE3-NEXT: imulq %rbx, %r10
; SSSE3-NEXT: movq %rsi, %rax
-; SSSE3-NEXT: mulq %rcx
+; SSSE3-NEXT: mulq %rbx
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: movq %rdx, %r10
-; SSSE3-NEXT: subq %r11, %r10
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSSE3-NEXT: andq %rax, %rcx
-; SSSE3-NEXT: subq %rcx, %r10
-; SSSE3-NEXT: movq %rax, %r11
-; SSSE3-NEXT: movq %rax, %r13
-; SSSE3-NEXT: sarq $63, %r11
-; SSSE3-NEXT: movq %r11, %rcx
-; SSSE3-NEXT: andq %rbp, %rcx
-; SSSE3-NEXT: movq %r11, %rax
+; SSSE3-NEXT: addq %r10, %rdx
+; SSSE3-NEXT: imulq %rbp, %rbx
+; SSSE3-NEXT: addq %rdx, %rbx
+; SSSE3-NEXT: movq %rbp, %r10
+; SSSE3-NEXT: sarq $63, %r10
+; SSSE3-NEXT: movq %r10, %r14
+; SSSE3-NEXT: imulq %rcx, %r14
+; SSSE3-NEXT: movq %r10, %rax
; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: movq %rdx, %r14
-; SSSE3-NEXT: subq %rcx, %r14
-; SSSE3-NEXT: andq %r8, %r11
-; SSSE3-NEXT: subq %r11, %r14
-; SSSE3-NEXT: addq %r9, %rbx
-; SSSE3-NEXT: adcq %r10, %r14
+; SSSE3-NEXT: movq %rax, %r11
+; SSSE3-NEXT: addq %r14, %rdx
+; SSSE3-NEXT: imulq %r8, %r10
+; SSSE3-NEXT: addq %rdx, %r10
+; SSSE3-NEXT: addq %r9, %r11
+; SSSE3-NEXT: adcq %rbx, %r10
; SSSE3-NEXT: movq %r8, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %r9
-; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: movq %rbp, %rax
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: movq %rcx, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: movq %rax, %r11
-; SSSE3-NEXT: addq %r9, %r11
+; SSSE3-NEXT: movq %rax, %r14
+; SSSE3-NEXT: addq %r9, %r14
; SSSE3-NEXT: adcq $0, %rsi
; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: mulq %r13
+; SSSE3-NEXT: mulq %rbp
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: addq %r11, %r9
+; SSSE3-NEXT: addq %r14, %r9
; SSSE3-NEXT: adcq %rsi, %r8
; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %ecx
-; SSSE3-NEXT: movq %rbp, %rax
-; SSSE3-NEXT: mulq %r13
+; SSSE3-NEXT: movzbl %al, %esi
+; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: mulq %rbp
; SSSE3-NEXT: addq %r8, %rax
-; SSSE3-NEXT: adcq %rcx, %rdx
-; SSSE3-NEXT: addq %rbx, %rax
-; SSSE3-NEXT: adcq %r14, %rdx
+; SSSE3-NEXT: adcq %rsi, %rdx
+; SSSE3-NEXT: addq %r11, %rax
+; SSSE3-NEXT: adcq %r10, %rdx
; SSSE3-NEXT: movq %r9, 24(%r12)
; SSSE3-NEXT: sarq $63, %r9
; SSSE3-NEXT: xorq %r9, %rdx
@@ -3562,7 +3550,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: negl %r15d
; SSSE3-NEXT: movd %r15d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %r10, 16(%r12)
+; SSSE3-NEXT: movq %rbx, 16(%r12)
; SSSE3-NEXT: movq %rdi, (%r12)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
@@ -3581,33 +3569,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: movq %r8, %r14
-; SSE41-NEXT: movq %rcx, %rbp
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rsi, %r11
; SSE41-NEXT: movq %rdi, %r10
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE41-NEXT: movq %r11, %rbx
-; SSE41-NEXT: sarq $63, %rbx
-; SSE41-NEXT: movq %rbx, %r15
-; SSE41-NEXT: andq %r14, %r15
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSE41-NEXT: movq %r11, %r12
+; SSE41-NEXT: sarq $63, %r12
+; SSE41-NEXT: movq %r14, %rbx
+; SSE41-NEXT: imulq %r12, %rbx
; SSE41-NEXT: movq %r14, %rax
-; SSE41-NEXT: mulq %rbx
+; SSE41-NEXT: mulq %r12
; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: movq %rdx, %r12
-; SSE41-NEXT: subq %r15, %r12
-; SSE41-NEXT: andq %r9, %rbx
-; SSE41-NEXT: subq %rbx, %r12
-; SSE41-NEXT: movq %r9, %r13
-; SSE41-NEXT: sarq $63, %r13
-; SSE41-NEXT: movq %r13, %rcx
-; SSE41-NEXT: andq %r11, %rcx
-; SSE41-NEXT: movq %r13, %rax
+; SSE41-NEXT: addq %rbx, %rdx
+; SSE41-NEXT: imulq %r9, %r12
+; SSE41-NEXT: addq %rdx, %r12
+; SSE41-NEXT: movq %r9, %rbx
+; SSE41-NEXT: sarq $63, %rbx
+; SSE41-NEXT: movq %rbx, %r13
+; SSE41-NEXT: imulq %r11, %r13
+; SSE41-NEXT: movq %rbx, %rax
; SSE41-NEXT: mulq %r10
; SSE41-NEXT: movq %rax, %r15
-; SSE41-NEXT: movq %rdx, %rbx
-; SSE41-NEXT: subq %rcx, %rbx
-; SSE41-NEXT: andq %r10, %r13
-; SSE41-NEXT: subq %r13, %rbx
+; SSE41-NEXT: addq %r13, %rdx
+; SSE41-NEXT: imulq %r10, %rbx
+; SSE41-NEXT: addq %rdx, %rbx
; SSE41-NEXT: addq %rdi, %r15
; SSE41-NEXT: adcq %r12, %rbx
; SSE41-NEXT: movq %r10, %rax
@@ -3627,11 +3613,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: addq %r13, %r10
; SSE41-NEXT: adcq %r14, %r12
; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %ecx
+; SSE41-NEXT: movzbl %al, %r14d
; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r9
; SSE41-NEXT: addq %r12, %rax
-; SSE41-NEXT: adcq %rcx, %rdx
+; SSE41-NEXT: adcq %r14, %rdx
; SSE41-NEXT: addq %r15, %rax
; SSE41-NEXT: adcq %rbx, %rdx
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3642,56 +3628,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: xorl %r15d, %r15d
; SSE41-NEXT: orq %rdx, %r10
; SSE41-NEXT: setne %r15b
-; SSE41-NEXT: movq %rbp, %rcx
-; SSE41-NEXT: sarq $63, %rcx
-; SSE41-NEXT: movq %rcx, %r11
-; SSE41-NEXT: andq %rsi, %r11
+; SSE41-NEXT: movq %rcx, %rbx
+; SSE41-NEXT: sarq $63, %rbx
+; SSE41-NEXT: movq %rsi, %r10
+; SSE41-NEXT: imulq %rbx, %r10
; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: mulq %rcx
+; SSE41-NEXT: mulq %rbx
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: movq %rdx, %r10
-; SSE41-NEXT: subq %r11, %r10
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT: andq %rax, %rcx
-; SSE41-NEXT: subq %rcx, %r10
-; SSE41-NEXT: movq %rax, %r11
-; SSE41-NEXT: movq %rax, %r13
-; SSE41-NEXT: sarq $63, %r11
-; SSE41-NEXT: movq %r11, %rcx
-; SSE41-NEXT: andq %rbp, %rcx
-; SSE41-NEXT: movq %r11, %rax
+; SSE41-NEXT: addq %r10, %rdx
+; SSE41-NEXT: imulq %rbp, %rbx
+; SSE41-NEXT: addq %rdx, %rbx
+; SSE41-NEXT: movq %rbp, %r10
+; SSE41-NEXT: sarq $63, %r10
+; SSE41-NEXT: movq %r10, %r14
+; SSE41-NEXT: imulq %rcx, %r14
+; SSE41-NEXT: movq %r10, %rax
; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: movq %rdx, %r14
-; SSE41-NEXT: subq %rcx, %r14
-; SSE41-NEXT: andq %r8, %r11
-; SSE41-NEXT: subq %r11, %r14
-; SSE41-NEXT: addq %r9, %rbx
-; SSE41-NEXT: adcq %r10, %r14
+; SSE41-NEXT: movq %rax, %r11
+; SSE41-NEXT: addq %r14, %rdx
+; SSE41-NEXT: imulq %r8, %r10
+; SSE41-NEXT: addq %rdx, %r10
+; SSE41-NEXT: addq %r9, %r11
+; SSE41-NEXT: adcq %rbx, %r10
; SSE41-NEXT: movq %r8, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %r9
-; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: movq %rbp, %rax
+; SSE41-NEXT: movq %rax, %rbx
+; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: movq %rax, %r11
-; SSE41-NEXT: addq %r9, %r11
+; SSE41-NEXT: movq %rax, %r14
+; SSE41-NEXT: addq %r9, %r14
; SSE41-NEXT: adcq $0, %rsi
; SSE41-NEXT: movq %r8, %rax
-; SSE41-NEXT: mulq %r13
+; SSE41-NEXT: mulq %rbp
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: addq %r11, %r9
+; SSE41-NEXT: addq %r14, %r9
; SSE41-NEXT: adcq %rsi, %r8
; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %ecx
-; SSE41-NEXT: movq %rbp, %rax
-; SSE41-NEXT: mulq %r13
+; SSE41-NEXT: movzbl %al, %esi
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rbp
; SSE41-NEXT: addq %r8, %rax
-; SSE41-NEXT: adcq %rcx, %rdx
-; SSE41-NEXT: addq %rbx, %rax
-; SSE41-NEXT: adcq %r14, %rdx
+; SSE41-NEXT: adcq %rsi, %rdx
+; SSE41-NEXT: addq %r11, %rax
+; SSE41-NEXT: adcq %r10, %rdx
; SSE41-NEXT: movq %r9, 24(%r12)
; SSE41-NEXT: sarq $63, %r9
; SSE41-NEXT: xorq %r9, %rdx
@@ -3703,7 +3685,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: negl %r15d
; SSE41-NEXT: movd %r15d, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %r10, 16(%r12)
+; SSE41-NEXT: movq %rbx, 16(%r12)
; SSE41-NEXT: movq %rdi, (%r12)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
@@ -3722,33 +3704,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %r8, %r14
-; AVX-NEXT: movq %rcx, %rbp
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rsi, %r11
; AVX-NEXT: movq %rdi, %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT: movq %r11, %rbx
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: movq %rbx, %r15
-; AVX-NEXT: andq %r14, %r15
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movq %r11, %r12
+; AVX-NEXT: sarq $63, %r12
+; AVX-NEXT: movq %r14, %rbx
+; AVX-NEXT: imulq %r12, %rbx
; AVX-NEXT: movq %r14, %rax
-; AVX-NEXT: mulq %rbx
+; AVX-NEXT: mulq %r12
; AVX-NEXT: movq %rax, %rdi
-; AVX-NEXT: movq %rdx, %r12
-; AVX-NEXT: subq %r15, %r12
-; AVX-NEXT: andq %r9, %rbx
-; AVX-NEXT: subq %rbx, %r12
-; AVX-NEXT: movq %r9, %r13
-; AVX-NEXT: sarq $63, %r13
-; AVX-NEXT: movq %r13, %rcx
-; AVX-NEXT: andq %r11, %rcx
-; AVX-NEXT: movq %r13, %rax
+; AVX-NEXT: addq %rbx, %rdx
+; AVX-NEXT: imulq %r9, %r12
+; AVX-NEXT: addq %rdx, %r12
+; AVX-NEXT: movq %r9, %rbx
+; AVX-NEXT: sarq $63, %rbx
+; AVX-NEXT: movq %rbx, %r13
+; AVX-NEXT: imulq %r11, %r13
+; AVX-NEXT: movq %rbx, %rax
; AVX-NEXT: mulq %r10
; AVX-NEXT: movq %rax, %r15
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: subq %rcx, %rbx
-; AVX-NEXT: andq %r10, %r13
-; AVX-NEXT: subq %r13, %rbx
+; AVX-NEXT: addq %r13, %rdx
+; AVX-NEXT: imulq %r10, %rbx
+; AVX-NEXT: addq %rdx, %rbx
; AVX-NEXT: addq %rdi, %r15
; AVX-NEXT: adcq %r12, %rbx
; AVX-NEXT: movq %r10, %rax
@@ -3768,11 +3748,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: addq %r13, %r10
; AVX-NEXT: adcq %r14, %r12
; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %ecx
+; AVX-NEXT: movzbl %al, %r14d
; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r9
; AVX-NEXT: addq %r12, %rax
-; AVX-NEXT: adcq %rcx, %rdx
+; AVX-NEXT: adcq %r14, %rdx
; AVX-NEXT: addq %r15, %rax
; AVX-NEXT: adcq %rbx, %rdx
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3783,56 +3763,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: xorl %r15d, %r15d
; AVX-NEXT: orq %rdx, %r10
; AVX-NEXT: setne %r15b
-; AVX-NEXT: movq %rbp, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %rcx, %r11
-; AVX-NEXT: andq %rsi, %r11
+; AVX-NEXT: movq %rcx, %rbx
+; AVX-NEXT: sarq $63, %rbx
+; AVX-NEXT: movq %rsi, %r10
+; AVX-NEXT: imulq %rbx, %r10
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: mulq %rcx
+; AVX-NEXT: mulq %rbx
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: movq %rdx, %r10
-; AVX-NEXT: subq %r11, %r10
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: andq %rax, %rcx
-; AVX-NEXT: subq %rcx, %r10
-; AVX-NEXT: movq %rax, %r11
-; AVX-NEXT: movq %rax, %r13
-; AVX-NEXT: sarq $63, %r11
-; AVX-NEXT: movq %r11, %rcx
-; AVX-NEXT: andq %rbp, %rcx
-; AVX-NEXT: movq %r11, %rax
+; AVX-NEXT: addq %r10, %rdx
+; AVX-NEXT: imulq %rbp, %rbx
+; AVX-NEXT: addq %rdx, %rbx
+; AVX-NEXT: movq %rbp, %r10
+; AVX-NEXT: sarq $63, %r10
+; AVX-NEXT: movq %r10, %r14
+; AVX-NEXT: imulq %rcx, %r14
+; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r8
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: movq %rdx, %r14
-; AVX-NEXT: subq %rcx, %r14
-; AVX-NEXT: andq %r8, %r11
-; AVX-NEXT: subq %r11, %r14
-; AVX-NEXT: addq %r9, %rbx
-; AVX-NEXT: adcq %r10, %r14
+; AVX-NEXT: movq %rax, %r11
+; AVX-NEXT: addq %r14, %rdx
+; AVX-NEXT: imulq %r8, %r10
+; AVX-NEXT: addq %rdx, %r10
+; AVX-NEXT: addq %r9, %r11
+; AVX-NEXT: adcq %rbx, %r10
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %r9
-; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: movq %rbp, %rax
+; AVX-NEXT: movq %rax, %rbx
+; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: movq %rax, %r11
-; AVX-NEXT: addq %r9, %r11
+; AVX-NEXT: movq %rax, %r14
+; AVX-NEXT: addq %r9, %r14
; AVX-NEXT: adcq $0, %rsi
; AVX-NEXT: movq %r8, %rax
-; AVX-NEXT: mulq %r13
+; AVX-NEXT: mulq %rbp
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: addq %r11, %r9
+; AVX-NEXT: addq %r14, %r9
; AVX-NEXT: adcq %rsi, %r8
; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %ecx
-; AVX-NEXT: movq %rbp, %rax
-; AVX-NEXT: mulq %r13
+; AVX-NEXT: movzbl %al, %esi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rbp
; AVX-NEXT: addq %r8, %rax
-; AVX-NEXT: adcq %rcx, %rdx
-; AVX-NEXT: addq %rbx, %rax
-; AVX-NEXT: adcq %r14, %rdx
+; AVX-NEXT: adcq %rsi, %rdx
+; AVX-NEXT: addq %r11, %rax
+; AVX-NEXT: adcq %r10, %rdx
; AVX-NEXT: movq %r9, 24(%r12)
; AVX-NEXT: sarq $63, %r9
; AVX-NEXT: xorq %r9, %rdx
@@ -3844,7 +3820,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: negl %r15d
; AVX-NEXT: vmovd %r15d, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %r10, 16(%r12)
+; AVX-NEXT: movq %rbx, 16(%r12)
; AVX-NEXT: movq %rdi, (%r12)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
@@ -3862,35 +3838,32 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512F-NEXT: movq %r9, %rbp
; AVX512F-NEXT: movq %rcx, %r11
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rsi, %rbp
-; AVX512F-NEXT: movq %rdi, %r9
+; AVX512F-NEXT: movq %rsi, %r9
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: movq %rcx, %rbx
-; AVX512F-NEXT: sarq $63, %rbx
-; AVX512F-NEXT: movq %rbx, %r14
-; AVX512F-NEXT: andq %r15, %r14
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT: movq %rcx, %r12
+; AVX512F-NEXT: sarq $63, %r12
+; AVX512F-NEXT: movq %r15, %rbx
+; AVX512F-NEXT: imulq %r12, %rbx
; AVX512F-NEXT: movq %r15, %rax
-; AVX512F-NEXT: mulq %rbx
+; AVX512F-NEXT: mulq %r12
; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: movq %rdx, %r12
-; AVX512F-NEXT: subq %r14, %r12
-; AVX512F-NEXT: andq %rdi, %rbx
-; AVX512F-NEXT: subq %rbx, %r12
-; AVX512F-NEXT: movq %rdi, %r13
-; AVX512F-NEXT: sarq $63, %r13
-; AVX512F-NEXT: movq %r13, %rsi
-; AVX512F-NEXT: andq %r11, %rsi
-; AVX512F-NEXT: movq %r13, %rax
+; AVX512F-NEXT: addq %rbx, %rdx
+; AVX512F-NEXT: imulq %rsi, %r12
+; AVX512F-NEXT: addq %rdx, %r12
+; AVX512F-NEXT: movq %rsi, %rbx
+; AVX512F-NEXT: sarq $63, %rbx
+; AVX512F-NEXT: movq %rbx, %r13
+; AVX512F-NEXT: imulq %r11, %r13
+; AVX512F-NEXT: movq %rbx, %rax
; AVX512F-NEXT: mulq %r10
; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: movq %rdx, %rbx
-; AVX512F-NEXT: subq %rsi, %rbx
-; AVX512F-NEXT: andq %r10, %r13
-; AVX512F-NEXT: subq %r13, %rbx
+; AVX512F-NEXT: addq %r13, %rdx
+; AVX512F-NEXT: imulq %r10, %rbx
+; AVX512F-NEXT: addq %rdx, %rbx
; AVX512F-NEXT: addq %rcx, %r14
; AVX512F-NEXT: adcq %r12, %rbx
; AVX512F-NEXT: movq %r10, %rax
@@ -3904,78 +3877,74 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: addq %r12, %r13
; AVX512F-NEXT: adcq $0, %r15
; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %rdi
+; AVX512F-NEXT: mulq %rsi
; AVX512F-NEXT: movq %rdx, %r12
; AVX512F-NEXT: movq %rax, %r10
; AVX512F-NEXT: addq %r13, %r10
; AVX512F-NEXT: adcq %r15, %r12
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %esi
+; AVX512F-NEXT: movzbl %al, %r15d
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %rdi
+; AVX512F-NEXT: mulq %rsi
; AVX512F-NEXT: addq %r12, %rax
-; AVX512F-NEXT: adcq %rsi, %rdx
+; AVX512F-NEXT: adcq %r15, %rdx
; AVX512F-NEXT: addq %r14, %rax
; AVX512F-NEXT: adcq %rbx, %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX512F-NEXT: movq %r10, 24(%r13)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512F-NEXT: movq %r10, 24(%r12)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
; AVX512F-NEXT: orq %rdx, %r10
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: movq %rbp, %rsi
+; AVX512F-NEXT: movq %r9, %rsi
; AVX512F-NEXT: sarq $63, %rsi
-; AVX512F-NEXT: movq %rsi, %rdi
-; AVX512F-NEXT: andq %r8, %rdi
+; AVX512F-NEXT: movq %r8, %r11
+; AVX512F-NEXT: imulq %rsi, %r11
; AVX512F-NEXT: movq %r8, %rax
; AVX512F-NEXT: mulq %rsi
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: movq %rdx, %r11
-; AVX512F-NEXT: subq %rdi, %r11
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: andq %rax, %rsi
-; AVX512F-NEXT: subq %rsi, %r11
+; AVX512F-NEXT: addq %r11, %rdx
+; AVX512F-NEXT: imulq %rbp, %rsi
+; AVX512F-NEXT: addq %rdx, %rsi
+; AVX512F-NEXT: movq %rbp, %r11
+; AVX512F-NEXT: sarq $63, %r11
+; AVX512F-NEXT: movq %r11, %r14
+; AVX512F-NEXT: imulq %r9, %r14
+; AVX512F-NEXT: movq %r11, %rax
+; AVX512F-NEXT: mulq %rdi
; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: movq %rax, %r12
-; AVX512F-NEXT: sarq $63, %rbx
-; AVX512F-NEXT: movq %rbx, %rsi
-; AVX512F-NEXT: andq %rbp, %rsi
-; AVX512F-NEXT: movq %rbx, %rax
-; AVX512F-NEXT: mulq %r9
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: movq %rdx, %r15
-; AVX512F-NEXT: subq %rsi, %r15
-; AVX512F-NEXT: andq %r9, %rbx
-; AVX512F-NEXT: subq %rbx, %r15
-; AVX512F-NEXT: addq %r10, %r14
-; AVX512F-NEXT: adcq %r11, %r15
-; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: addq %r14, %rdx
+; AVX512F-NEXT: imulq %rdi, %r11
+; AVX512F-NEXT: addq %rdx, %r11
+; AVX512F-NEXT: addq %r10, %rbx
+; AVX512F-NEXT: adcq %rsi, %r11
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: movq %rbp, %rax
+; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: movq %r9, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: addq %r10, %rbx
+; AVX512F-NEXT: movq %rax, %r15
+; AVX512F-NEXT: addq %r10, %r15
; AVX512F-NEXT: adcq $0, %r8
-; AVX512F-NEXT: movq %r9, %rax
-; AVX512F-NEXT: mulq %r12
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: mulq %rbp
; AVX512F-NEXT: movq %rdx, %rdi
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %rbx, %r10
+; AVX512F-NEXT: addq %r15, %r10
; AVX512F-NEXT: adcq %r8, %rdi
; AVX512F-NEXT: setb %al
; AVX512F-NEXT: movzbl %al, %esi
-; AVX512F-NEXT: movq %rbp, %rax
-; AVX512F-NEXT: mulq %r12
+; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: mulq %rbp
; AVX512F-NEXT: addq %rdi, %rax
; AVX512F-NEXT: adcq %rsi, %rdx
-; AVX512F-NEXT: addq %r14, %rax
-; AVX512F-NEXT: adcq %r15, %rdx
-; AVX512F-NEXT: movq %r10, 8(%r13)
+; AVX512F-NEXT: addq %rbx, %rax
+; AVX512F-NEXT: adcq %r11, %rdx
+; AVX512F-NEXT: movq %r10, 8(%r12)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
@@ -3987,8 +3956,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT: movq %rcx, 16(%r13)
-; AVX512F-NEXT: movq %r11, (%r13)
+; AVX512F-NEXT: movq %rcx, 16(%r12)
+; AVX512F-NEXT: movq %r14, (%r12)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r13
@@ -4005,35 +3974,32 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: pushq %r13
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: movq %r9, %rbp
; AVX512BW-NEXT: movq %rcx, %r11
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rsi, %rbp
-; AVX512BW-NEXT: movq %rdi, %r9
+; AVX512BW-NEXT: movq %rsi, %r9
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512BW-NEXT: movq %rcx, %rbx
-; AVX512BW-NEXT: sarq $63, %rbx
-; AVX512BW-NEXT: movq %rbx, %r14
-; AVX512BW-NEXT: andq %r15, %r14
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512BW-NEXT: movq %rcx, %r12
+; AVX512BW-NEXT: sarq $63, %r12
+; AVX512BW-NEXT: movq %r15, %rbx
+; AVX512BW-NEXT: imulq %r12, %rbx
; AVX512BW-NEXT: movq %r15, %rax
-; AVX512BW-NEXT: mulq %rbx
+; AVX512BW-NEXT: mulq %r12
; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: movq %rdx, %r12
-; AVX512BW-NEXT: subq %r14, %r12
-; AVX512BW-NEXT: andq %rdi, %rbx
-; AVX512BW-NEXT: subq %rbx, %r12
-; AVX512BW-NEXT: movq %rdi, %r13
-; AVX512BW-NEXT: sarq $63, %r13
-; AVX512BW-NEXT: movq %r13, %rsi
-; AVX512BW-NEXT: andq %r11, %rsi
-; AVX512BW-NEXT: movq %r13, %rax
+; AVX512BW-NEXT: addq %rbx, %rdx
+; AVX512BW-NEXT: imulq %rsi, %r12
+; AVX512BW-NEXT: addq %rdx, %r12
+; AVX512BW-NEXT: movq %rsi, %rbx
+; AVX512BW-NEXT: sarq $63, %rbx
+; AVX512BW-NEXT: movq %rbx, %r13
+; AVX512BW-NEXT: imulq %r11, %r13
+; AVX512BW-NEXT: movq %rbx, %rax
; AVX512BW-NEXT: mulq %r10
; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: movq %rdx, %rbx
-; AVX512BW-NEXT: subq %rsi, %rbx
-; AVX512BW-NEXT: andq %r10, %r13
-; AVX512BW-NEXT: subq %r13, %rbx
+; AVX512BW-NEXT: addq %r13, %rdx
+; AVX512BW-NEXT: imulq %r10, %rbx
+; AVX512BW-NEXT: addq %rdx, %rbx
; AVX512BW-NEXT: addq %rcx, %r14
; AVX512BW-NEXT: adcq %r12, %rbx
; AVX512BW-NEXT: movq %r10, %rax
@@ -4047,78 +4013,74 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: addq %r12, %r13
; AVX512BW-NEXT: adcq $0, %r15
; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %rdi
+; AVX512BW-NEXT: mulq %rsi
; AVX512BW-NEXT: movq %rdx, %r12
; AVX512BW-NEXT: movq %rax, %r10
; AVX512BW-NEXT: addq %r13, %r10
; AVX512BW-NEXT: adcq %r15, %r12
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %esi
+; AVX512BW-NEXT: movzbl %al, %r15d
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %rdi
+; AVX512BW-NEXT: mulq %rsi
; AVX512BW-NEXT: addq %r12, %rax
-; AVX512BW-NEXT: adcq %rsi, %rdx
+; AVX512BW-NEXT: adcq %r15, %rdx
; AVX512BW-NEXT: addq %r14, %rax
; AVX512BW-NEXT: adcq %rbx, %rdx
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX512BW-NEXT: movq %r10, 24(%r13)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512BW-NEXT: movq %r10, 24(%r12)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
; AVX512BW-NEXT: orq %rdx, %r10
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: movq %rbp, %rsi
+; AVX512BW-NEXT: movq %r9, %rsi
; AVX512BW-NEXT: sarq $63, %rsi
-; AVX512BW-NEXT: movq %rsi, %rdi
-; AVX512BW-NEXT: andq %r8, %rdi
+; AVX512BW-NEXT: movq %r8, %r11
+; AVX512BW-NEXT: imulq %rsi, %r11
; AVX512BW-NEXT: movq %r8, %rax
; AVX512BW-NEXT: mulq %rsi
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: movq %rdx, %r11
-; AVX512BW-NEXT: subq %rdi, %r11
-; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT: andq %rax, %rsi
-; AVX512BW-NEXT: subq %rsi, %r11
+; AVX512BW-NEXT: addq %r11, %rdx
+; AVX512BW-NEXT: imulq %rbp, %rsi
+; AVX512BW-NEXT: addq %rdx, %rsi
+; AVX512BW-NEXT: movq %rbp, %r11
+; AVX512BW-NEXT: sarq $63, %r11
+; AVX512BW-NEXT: movq %r11, %r14
+; AVX512BW-NEXT: imulq %r9, %r14
+; AVX512BW-NEXT: movq %r11, %rax
+; AVX512BW-NEXT: mulq %rdi
; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: movq %rax, %r12
-; AVX512BW-NEXT: sarq $63, %rbx
-; AVX512BW-NEXT: movq %rbx, %rsi
-; AVX512BW-NEXT: andq %rbp, %rsi
-; AVX512BW-NEXT: movq %rbx, %rax
-; AVX512BW-NEXT: mulq %r9
-; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: movq %rdx, %r15
-; AVX512BW-NEXT: subq %rsi, %r15
-; AVX512BW-NEXT: andq %r9, %rbx
-; AVX512BW-NEXT: subq %rbx, %r15
-; AVX512BW-NEXT: addq %r10, %r14
-; AVX512BW-NEXT: adcq %r11, %r15
-; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: addq %r14, %rdx
+; AVX512BW-NEXT: imulq %rdi, %r11
+; AVX512BW-NEXT: addq %rdx, %r11
+; AVX512BW-NEXT: addq %r10, %rbx
+; AVX512BW-NEXT: adcq %rsi, %r11
+; AVX512BW-NEXT: movq %rdi, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rax, %r11
-; AVX512BW-NEXT: movq %rbp, %rax
+; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: movq %r9, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r8
-; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: addq %r10, %rbx
+; AVX512BW-NEXT: movq %rax, %r15
+; AVX512BW-NEXT: addq %r10, %r15
; AVX512BW-NEXT: adcq $0, %r8
-; AVX512BW-NEXT: movq %r9, %rax
-; AVX512BW-NEXT: mulq %r12
+; AVX512BW-NEXT: movq %rdi, %rax
+; AVX512BW-NEXT: mulq %rbp
; AVX512BW-NEXT: movq %rdx, %rdi
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %rbx, %r10
+; AVX512BW-NEXT: addq %r15, %r10
; AVX512BW-NEXT: adcq %r8, %rdi
; AVX512BW-NEXT: setb %al
; AVX512BW-NEXT: movzbl %al, %esi
-; AVX512BW-NEXT: movq %rbp, %rax
-; AVX512BW-NEXT: mulq %r12
+; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: mulq %rbp
; AVX512BW-NEXT: addq %rdi, %rax
; AVX512BW-NEXT: adcq %rsi, %rdx
-; AVX512BW-NEXT: addq %r14, %rax
-; AVX512BW-NEXT: adcq %r15, %rdx
-; AVX512BW-NEXT: movq %r10, 8(%r13)
+; AVX512BW-NEXT: addq %rbx, %rax
+; AVX512BW-NEXT: adcq %r11, %rdx
+; AVX512BW-NEXT: movq %r10, 8(%r12)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
@@ -4130,8 +4092,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: movq %rcx, 16(%r13)
-; AVX512BW-NEXT: movq %r11, (%r13)
+; AVX512BW-NEXT: movq %rcx, 16(%r12)
+; AVX512BW-NEXT: movq %r14, (%r12)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
; AVX512BW-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 508b0d7fe0f2b..4adc80b3b8bd6 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -215,36 +215,35 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: subl $8, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT: movl %ebx, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: imull %edi, %esi
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl %esi, %edx
+; WIN32-NEXT: movl %ebp, %esi
+; WIN32-NEXT: imull %ebp, %edi
+; WIN32-NEXT: addl %edx, %edi
; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %edi
-; WIN32-NEXT: andl %eax, %edi
-; WIN32-NEXT: mull %esi
+; WIN32-NEXT: movl %esi, %ebp
+; WIN32-NEXT: imull %ecx, %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: addl %ebp, %edx
+; WIN32-NEXT: imull %ecx, %esi
+; WIN32-NEXT: addl %edx, %esi
+; WIN32-NEXT: addl %ebx, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: subl %edi, %ecx
-; WIN32-NEXT: andl %ebp, %esi
-; WIN32-NEXT: subl %esi, %ecx
-; WIN32-NEXT: sarl $31, %ebp
-; WIN32-NEXT: movl %ebp, %edi
-; WIN32-NEXT: andl %ebx, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: subl %edi, %esi
-; WIN32-NEXT: andl %ebx, %ebp
-; WIN32-NEXT: subl %ebp, %esi
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %esi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: adcl %edi, %esi
+; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: movl %ecx, %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebx
@@ -263,7 +262,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: addl %edi, %eax
; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; WIN32-NEXT: adcl %esi, %edx
; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: sarl $31, %ecx
@@ -272,7 +271,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl %ebp, 4(%eax)
-; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
; WIN32-NEXT: addl $8, %esp
@@ -574,52 +573,49 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: andl %eax, %edi
+; WIN32-NEXT: movl %ebp, %edi
+; WIN32-NEXT: imull %ecx, %edi
+; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: subl %edi, %esi
-; WIN32-NEXT: andl %ebx, %ecx
-; WIN32-NEXT: subl %ecx, %esi
-; WIN32-NEXT: movl %ebx, %ecx
-; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: andl %ebp, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: subl %edi, %ebx
-; WIN32-NEXT: movl %ebp, %edi
-; WIN32-NEXT: andl %ebp, %ecx
-; WIN32-NEXT: subl %ecx, %ebx
+; WIN32-NEXT: addl %edi, %edx
+; WIN32-NEXT: imull %ebx, %ecx
+; WIN32-NEXT: addl %edx, %ecx
+; WIN32-NEXT: sarl $31, %ebx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: imull %esi, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: mull %esi
+; WIN32-NEXT: addl %edi, %edx
+; WIN32-NEXT: movl %esi, %edi
+; WIN32-NEXT: imull %esi, %ebx
+; WIN32-NEXT: addl %edx, %ebx
; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %esi, %ebx
+; WIN32-NEXT: adcl %ecx, %ebx
; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %edx, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %esi, %ecx
-; WIN32-NEXT: adcl $0, %ebp
+; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %esi, %ebp
+; WIN32-NEXT: adcl $0, %ecx
; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %edi
; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %ecx, %esi
-; WIN32-NEXT: adcl %ebp, %edi
+; WIN32-NEXT: addl %ebp, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: adcl %ecx, %edi
; WIN32-NEXT: setb %cl
; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
@@ -1003,32 +999,30 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %ecx, %esi
-; WIN32-NEXT: movl %ecx, %ebp
+; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: imull %edi, %esi
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %esi, %edx
+; WIN32-NEXT: movl %ebx, %esi
+; WIN32-NEXT: imull %ebx, %edi
+; WIN32-NEXT: addl %edx, %edi
; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %edi
-; WIN32-NEXT: andl %eax, %edi
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: subl %edi, %ecx
-; WIN32-NEXT: andl %ebx, %esi
-; WIN32-NEXT: subl %esi, %ecx
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: movl %ebx, %edi
-; WIN32-NEXT: andl %ebp, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: subl %edi, %esi
-; WIN32-NEXT: andl %ebp, %ebx
-; WIN32-NEXT: subl %ebx, %esi
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: movl %esi, %ebx
+; WIN32-NEXT: imull %ecx, %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: addl %ebx, %edx
+; WIN32-NEXT: imull %ecx, %esi
+; WIN32-NEXT: addl %edx, %esi
+; WIN32-NEXT: addl %ebp, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %esi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: movl %ebp, %edi
+; WIN32-NEXT: adcl %edi, %esi
+; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: movl %ecx, %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebx
@@ -1710,62 +1704,57 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: subl $16, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl (%eax), %esi
-; WIN32-NEXT: movl 4(%eax), %eax
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: movl %edi, %ecx
-; WIN32-NEXT: andl %eax, %ecx
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: movl 4(%eax), %ebp
+; WIN32-NEXT: sarl $31, %ebx
+; WIN32-NEXT: movl %ebx, %ecx
+; WIN32-NEXT: imull %ebp, %ecx
+; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: subl %ecx, %ebp
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: addl %ecx, %edx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: andl %esi, %edi
-; WIN32-NEXT: subl %edi, %ebp
-; WIN32-NEXT: movl %ebx, %ecx
+; WIN32-NEXT: imull %esi, %ebx
+; WIN32-NEXT: addl %edx, %ebx
+; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ecx, %ebx
-; WIN32-NEXT: andl %eax, %ebx
+; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: imull %ecx, %edi
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: subl %ebx, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: andl %edx, %ecx
-; WIN32-NEXT: subl %ecx, %edi
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl %edi, %edx
+; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: addl %edx, %ecx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %ebp, %edi
+; WIN32-NEXT: adcl %ebx, %ecx
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull %edi
; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: addl %ebx, %edi
; WIN32-NEXT: adcl $0, %ebp
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %ebx, %esi
-; WIN32-NEXT: adcl %ebp, %ecx
-; WIN32-NEXT: setb %bl
+; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: adcl %ebp, %ebx
+; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ecx, %eax
-; WIN32-NEXT: movzbl %bl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl %ebx, %eax
+; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
; WIN32-NEXT: adcl %edi, %edx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %ecx, %edx
; WIN32-NEXT: movl %esi, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
@@ -1773,7 +1762,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl %esi, 4(%eax)
-; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
; WIN32-NEXT: addl $16, %esp
@@ -1821,35 +1810,35 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: subl $12, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl (%eax), %ebp
; WIN32-NEXT: movl 4(%eax), %ebx
-; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %edi
-; WIN32-NEXT: andl %ebp, %edi
+; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: movl %ebp, %esi
+; WIN32-NEXT: imull %edi, %esi
; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %esi
+; WIN32-NEXT: mull %edi
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: subl %edi, %ecx
+; WIN32-NEXT: addl %esi, %edx
+; WIN32-NEXT: movl %ebx, %esi
; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: andl %ebx, %esi
-; WIN32-NEXT: subl %esi, %ecx
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: movl %ebx, %edi
-; WIN32-NEXT: andl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: subl %edi, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: andl %edx, %ebx
-; WIN32-NEXT: subl %ebx, %esi
+; WIN32-NEXT: imull %ebx, %edi
+; WIN32-NEXT: addl %edx, %edi
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: movl %esi, %ebx
+; WIN32-NEXT: imull %ecx, %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: addl %ebx, %edx
+; WIN32-NEXT: imull %ecx, %esi
+; WIN32-NEXT: addl %edx, %esi
; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %esi
-; WIN32-NEXT: movl %edx, %eax
+; WIN32-NEXT: adcl %edi, %esi
+; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %edx, %edi
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill