[llvm] e8b3ffa - [DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 22 21:58:33 PDT 2022
Author: Craig Topper
Date: 2022-10-22T21:51:45-07:00
New Revision: e8b3ffa532b8ebac5dcdf17bb91b47817382c14d
URL: https://github.com/llvm/llvm-project/commit/e8b3ffa532b8ebac5dcdf17bb91b47817382c14d
DIFF: https://github.com/llvm/llvm-project/commit/e8b3ffa532b8ebac5dcdf17bb91b47817382c14d.diff
LOG: [DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
(sra X, BW-1) is either 0 or -1, so the multiply is a conditional
negation of Y.
This pattern shows up when type legalizing wide multiplies involving
a sign-extended value.
Fixes PR57549.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D133399
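The identity the combine relies on can be sanity-checked with a small
standalone C++ snippet (not part of this patch). It assumes arithmetic
right shift of negative signed values, which matches the semantics of the
SelectionDAG sra node; the names below are illustrative only.

  // m = x >> 63 is 0 or -1, so m * y == -(m & y): a conditional negation.
  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t Xs[] = {INT64_MIN, -7, -1, 0, 1, 7, INT64_MAX};
    const int64_t Ys[] = {-5, -1, 0, 1, 9};
    for (int64_t X : Xs)
      for (int64_t Y : Ys) {
        int64_t M = X >> 63;       // arithmetic shift: 0 if X >= 0, -1 if X < 0
        assert(M * Y == -(M & Y)); // mul by the sign mask == neg of the masked value
      }
    return 0;
  }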
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
llvm/test/CodeGen/AMDGPU/mad_64_32.ll
llvm/test/CodeGen/PowerPC/pr45448.ll
llvm/test/CodeGen/RISCV/mul.ll
llvm/test/CodeGen/RISCV/xaluo.ll
llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
llvm/test/CodeGen/X86/extmul128.ll
llvm/test/CodeGen/X86/muloti.ll
llvm/test/CodeGen/X86/smul_fix_sat.ll
llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
llvm/test/CodeGen/X86/vec_smulo.ll
llvm/test/CodeGen/X86/xmulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a4c04b525bdf0..44ce4947a6ab8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3957,6 +3957,30 @@ SDValue DAGCombiner::visitMULFIX(SDNode *N) {
return SDValue();
}
+// Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
+static SDValue foldSraMulToAndNeg(SDNode *N, SDValue N0, SDValue N1,
+ SelectionDAG &DAG) {
+ if (N0.getOpcode() != ISD::SRA)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // TODO: Use computeNumSignBits() == BitWidth?
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ ConstantSDNode *ShiftAmt = isConstOrConstSplat(N0.getOperand(1));
+ if (!ShiftAmt || ShiftAmt->getAPIntValue() != (BitWidth - 1))
+ return SDValue();
+
+ // If optimizing for minsize, we don't want to increase the number of
+ // instructions.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue And = DAG.getNode(ISD::AND, dl, VT, N0, N1);
+ return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), And);
+}
+
SDValue DAGCombiner::visitMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4167,6 +4191,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
}
}
+ if (SDValue V = foldSraMulToAndNeg(N, N0, N1, DAG))
+ return V;
+ if (SDValue V = foldSraMulToAndNeg(N, N1, N0, DAG))
+ return V;
+
// reassociate mul
if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
return RMUL;
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index e955014371525..c01ec69629f30 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -39,21 +39,24 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
; AARCH: // %bb.0: // %Entry
; AARCH-NEXT: asr x9, x1, #63
; AARCH-NEXT: asr x10, x3, #63
+; AARCH-NEXT: and x11, x9, x2
+; AARCH-NEXT: and x14, x10, x1
+; AARCH-NEXT: umulh x12, x2, x9
+; AARCH-NEXT: and x9, x9, x3
+; AARCH-NEXT: umulh x13, x10, x0
+; AARCH-NEXT: and x10, x10, x0
+; AARCH-NEXT: sub x12, x12, x11
+; AARCH-NEXT: neg x11, x11
+; AARCH-NEXT: sub x13, x13, x14
+; AARCH-NEXT: sub x9, x12, x9
+; AARCH-NEXT: sub x12, x13, x10
+; AARCH-NEXT: neg x10, x10
; AARCH-NEXT: umulh x14, x0, x2
-; AARCH-NEXT: mov x8, x1
-; AARCH-NEXT: mul x11, x2, x9
-; AARCH-NEXT: str wzr, [x4]
-; AARCH-NEXT: umulh x12, x10, x0
-; AARCH-NEXT: umulh x13, x2, x9
-; AARCH-NEXT: madd x12, x10, x1, x12
-; AARCH-NEXT: add x13, x13, x11
-; AARCH-NEXT: mul x10, x10, x0
-; AARCH-NEXT: madd x9, x3, x9, x13
-; AARCH-NEXT: add x12, x12, x10
; AARCH-NEXT: adds x10, x10, x11
; AARCH-NEXT: mul x11, x1, x2
; AARCH-NEXT: adc x9, x12, x9
; AARCH-NEXT: umulh x13, x1, x2
+; AARCH-NEXT: mov x8, x1
; AARCH-NEXT: mul x12, x0, x3
; AARCH-NEXT: adds x11, x11, x14
; AARCH-NEXT: umulh x14, x0, x3
@@ -73,6 +76,7 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
; AARCH-NEXT: eor x9, x9, x11
; AARCH-NEXT: eor x10, x10, x11
; AARCH-NEXT: orr x9, x10, x9
+; AARCH-NEXT: str wzr, [x4]
; AARCH-NEXT: cmp x9, #0
; AARCH-NEXT: cset w9, ne
; AARCH-NEXT: tbz x8, #63, .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index f806149d0c395..fc65050be9f92 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -159,24 +159,28 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v0
+; CI-NEXT: v_ashrrev_i32_e32 v11, 31, v0
; CI-NEXT: v_mov_b32_e32 v8, 0
-; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
-; CI-NEXT: v_ashrrev_i32_e32 v14, 31, v1
-; CI-NEXT: v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
-; CI-NEXT: v_mov_b32_e32 v7, v10
+; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v1, v[7:8]
+; CI-NEXT: v_ashrrev_i32_e32 v12, 31, v1
+; CI-NEXT: v_and_b32_e32 v14, v11, v1
+; CI-NEXT: v_mov_b32_e32 v1, v10
; CI-NEXT: v_mov_b32_e32 v10, v8
-; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
-; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
-; CI-NEXT: v_add_i32_e32 v9, vcc, v7, v9
-; CI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
-; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
-; CI-NEXT: v_add_i32_e32 v7, vcc, v9, v0
-; CI-NEXT: v_addc_u32_e32 v9, vcc, v10, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v1, v8
+; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10]
+; CI-NEXT: v_and_b32_e32 v13, v11, v12
+; CI-NEXT: v_sub_i32_e32 v9, vcc, 0, v14
+; CI-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc
+; CI-NEXT: v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10]
+; CI-NEXT: v_mov_b32_e32 v0, v8
+; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1]
+; CI-NEXT: v_add_i32_e32 v8, vcc, v0, v9
+; CI-NEXT: v_addc_u32_e32 v9, vcc, v1, v10, vcc
+; CI-NEXT: v_mov_b32_e32 v1, v7
; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
+; CI-NEXT: v_addc_u32_e32 v2, vcc, v8, v4, vcc
; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -184,60 +188,64 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0
-; SI-NEXT: v_mul_lo_u32 v11, v6, v1
-; SI-NEXT: v_mul_hi_u32 v12, v0, v1
; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; SI-NEXT: v_mul_hi_u32 v14, v6, v1
-; SI-NEXT: v_mul_lo_u32 v13, v0, v7
-; SI-NEXT: v_mul_hi_u32 v10, v0, v7
-; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12
-; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
-; SI-NEXT: v_mul_hi_u32 v8, v6, v7
-; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7
-; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; SI-NEXT: v_mul_hi_i32 v6, v1, v6
-; SI-NEXT: v_mul_hi_i32 v7, v7, v0
-; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
-; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc
-; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v11
+; SI-NEXT: v_and_b32_e32 v9, v6, v1
+; SI-NEXT: v_and_b32_e32 v10, v7, v0
+; SI-NEXT: v_mul_lo_u32 v13, v6, v1
+; SI-NEXT: v_mul_hi_u32 v14, v0, v1
+; SI-NEXT: v_and_b32_e32 v8, v6, v7
+; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; SI-NEXT: v_mul_hi_u32 v10, v6, v7
+; SI-NEXT: v_mul_i32_i24_e32 v11, v6, v7
+; SI-NEXT: v_mul_hi_u32 v6, v6, v1
+; SI-NEXT: v_mul_hi_u32 v12, v0, v7
+; SI-NEXT: v_mul_lo_u32 v7, v0, v7
+; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v8, vcc
+; SI-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc
+; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v12
+; SI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc
+; SI-NEXT: v_add_i32_e32 v6, vcc, v11, v6
; SI-NEXT: v_mul_lo_u32 v0, v0, v1
-; SI-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; SI-NEXT: v_add_i32_e32 v7, vcc, v9, v10
-; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc
+; SI-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
+; SI-NEXT: v_subb_u32_e32 v8, vcc, v10, v8, vcc
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT: v_addc_u32_e32 v1, vcc, v12, v3, vcc
-; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v7, v3, vcc
+; SI-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
-; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v11
-; GFX9-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
-; GFX9-NEXT: v_mov_b32_e32 v12, v11
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
-; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
-; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v1, v10
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v0
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v1
+; GFX9-NEXT: v_and_b32_e32 v6, v14, v1
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_mov_b32_e32 v10, v9
+; GFX9-NEXT: v_and_b32_e32 v7, v14, v15
+; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v6
+; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11]
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, v13
+; GFX9-NEXT: v_mov_b32_e32 v13, v11
+; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7]
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13]
+; GFX9-NEXT: v_mov_b32_e32 v12, v1
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11]
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
@@ -246,27 +254,30 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0
; GFX11-NEXT: v_mov_b32_e32 v8, 0
-; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0
-; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1
+; GFX11-NEXT: v_ashrrev_i32_e32 v16, 31, v0
+; GFX11-NEXT: v_ashrrev_i32_e32 v17, 31, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v16, v1, v[7:8]
; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
-; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
-; GFX11-NEXT: v_mov_b32_e32 v8, v12
+; GFX11-NEXT: v_and_b32_e32 v8, v16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v17, v[9:10]
+; GFX11-NEXT: v_and_b32_e32 v9, v16, v17
+; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, 0, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT: v_mov_b32_e32 v1, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
-; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
+; GFX11-NEXT: v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9]
+; GFX11-NEXT: v_add_co_u32 v12, s0, v7, v1
; GFX11-NEXT: v_mov_b32_e32 v7, v11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
-; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13]
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll
index 0f8014df8adca..c3337c78a4770 100644
--- a/llvm/test/CodeGen/PowerPC/pr45448.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45448.ll
@@ -25,7 +25,8 @@ define hidden void @julia_tryparse_internal_45896() #0 {
; CHECK-NEXT: rldic r5, r5, 4, 32
; CHECK-NEXT: crnot 4*cr5+lt, eq
; CHECK-NEXT: mulhdu r3, r3, r5
-; CHECK-NEXT: maddld r6, r4, r5, r3
+; CHECK-NEXT: and r6, r4, r5
+; CHECK-NEXT: sub r6, r3, r6
; CHECK-NEXT: cmpld cr1, r6, r3
; CHECK-NEXT: mulhdu. r3, r4, r5
; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_10
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 3923c4340d30e..986e799428e57 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1480,18 +1480,18 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
; RV32IM-NEXT: add a5, a6, a2
; RV32IM-NEXT: mul a7, a1, a3
; RV32IM-NEXT: add t0, a7, a5
-; RV32IM-NEXT: mul t1, a4, a0
-; RV32IM-NEXT: add a2, t0, t1
+; RV32IM-NEXT: and t1, a4, a0
+; RV32IM-NEXT: sub a2, t0, t1
; RV32IM-NEXT: sltu t2, a2, t0
; RV32IM-NEXT: sltu a7, t0, a7
; RV32IM-NEXT: sltu a5, a5, a6
; RV32IM-NEXT: mulhu a3, a1, a3
; RV32IM-NEXT: add a3, a3, a5
; RV32IM-NEXT: add a3, a3, a7
-; RV32IM-NEXT: mul a1, a4, a1
+; RV32IM-NEXT: and a1, a4, a1
; RV32IM-NEXT: mulhu a0, a4, a0
-; RV32IM-NEXT: add a0, a0, a1
-; RV32IM-NEXT: add a0, a0, t1
+; RV32IM-NEXT: sub a0, a0, a1
+; RV32IM-NEXT: sub a0, a0, t1
; RV32IM-NEXT: add a0, a3, a0
; RV32IM-NEXT: add a1, a0, t2
; RV32IM-NEXT: mv a0, a2
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index f6963fd674d3e..f3391b2816495 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -961,8 +961,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
; RV32-NEXT: .cfi_offset s1, -8
+; RV32-NEXT: .cfi_offset s2, -12
; RV32-NEXT: mulhu a5, a0, a2
; RV32-NEXT: mul a6, a1, a2
; RV32-NEXT: add a5, a6, a5
@@ -978,33 +980,34 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32-NEXT: mul t0, a1, a3
; RV32-NEXT: add t1, t0, a7
; RV32-NEXT: srai t2, a1, 31
-; RV32-NEXT: mul t3, a2, t2
+; RV32-NEXT: and t3, t2, a2
; RV32-NEXT: srai t4, a3, 31
-; RV32-NEXT: mul t5, t4, a0
-; RV32-NEXT: add t6, t5, t3
-; RV32-NEXT: add s0, t1, t6
-; RV32-NEXT: sltu s1, s0, t1
+; RV32-NEXT: and t5, t4, a0
+; RV32-NEXT: neg t6, t5
+; RV32-NEXT: sub s0, t6, t3
+; RV32-NEXT: add s1, t1, s0
+; RV32-NEXT: sltu s2, s1, t1
; RV32-NEXT: sltu t0, t1, t0
; RV32-NEXT: sltu a6, a7, a6
; RV32-NEXT: mulhu a7, a1, a3
; RV32-NEXT: add a6, a7, a6
; RV32-NEXT: add a6, a6, t0
; RV32-NEXT: mulhu a7, a2, t2
-; RV32-NEXT: add a7, a7, t3
-; RV32-NEXT: mul a3, a3, t2
-; RV32-NEXT: add a3, a7, a3
-; RV32-NEXT: mul a1, t4, a1
+; RV32-NEXT: sub a7, a7, t3
+; RV32-NEXT: and a3, t2, a3
+; RV32-NEXT: sub a3, a7, a3
+; RV32-NEXT: and a1, t4, a1
; RV32-NEXT: mulhu a7, t4, a0
-; RV32-NEXT: add a1, a7, a1
-; RV32-NEXT: add a1, a1, t5
+; RV32-NEXT: sub a1, a7, a1
+; RV32-NEXT: sub a1, a1, t5
; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: sltu a3, t6, t5
+; RV32-NEXT: sltu a3, s0, t6
; RV32-NEXT: add a1, a1, a3
; RV32-NEXT: add a1, a6, a1
-; RV32-NEXT: add a1, a1, s1
+; RV32-NEXT: add a1, a1, s2
; RV32-NEXT: srai a3, a5, 31
; RV32-NEXT: xor a1, a1, a3
-; RV32-NEXT: xor a3, s0, a3
+; RV32-NEXT: xor a3, s1, a3
; RV32-NEXT: or a1, a3, a1
; RV32-NEXT: snez a1, a1
; RV32-NEXT: mul a0, a0, a2
@@ -1013,6 +1016,7 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32-NEXT: mv a0, a1
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -1032,8 +1036,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
; RV32ZBA-NEXT: .cfi_offset s1, -8
+; RV32ZBA-NEXT: .cfi_offset s2, -12
; RV32ZBA-NEXT: mulhu a5, a0, a2
; RV32ZBA-NEXT: mul a6, a1, a2
; RV32ZBA-NEXT: add a5, a6, a5
@@ -1049,33 +1055,34 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32ZBA-NEXT: mul t0, a1, a3
; RV32ZBA-NEXT: add t1, t0, a7
; RV32ZBA-NEXT: srai t2, a1, 31
-; RV32ZBA-NEXT: mul t3, a2, t2
+; RV32ZBA-NEXT: and t3, t2, a2
; RV32ZBA-NEXT: srai t4, a3, 31
-; RV32ZBA-NEXT: mul t5, t4, a0
-; RV32ZBA-NEXT: add t6, t5, t3
-; RV32ZBA-NEXT: add s0, t1, t6
-; RV32ZBA-NEXT: sltu s1, s0, t1
+; RV32ZBA-NEXT: and t5, t4, a0
+; RV32ZBA-NEXT: neg t6, t5
+; RV32ZBA-NEXT: sub s0, t6, t3
+; RV32ZBA-NEXT: add s1, t1, s0
+; RV32ZBA-NEXT: sltu s2, s1, t1
; RV32ZBA-NEXT: sltu t0, t1, t0
; RV32ZBA-NEXT: sltu a6, a7, a6
; RV32ZBA-NEXT: mulhu a7, a1, a3
; RV32ZBA-NEXT: add a6, a7, a6
; RV32ZBA-NEXT: add a6, a6, t0
; RV32ZBA-NEXT: mulhu a7, a2, t2
-; RV32ZBA-NEXT: add a7, a7, t3
-; RV32ZBA-NEXT: mul a3, a3, t2
-; RV32ZBA-NEXT: add a3, a7, a3
-; RV32ZBA-NEXT: mul a1, t4, a1
+; RV32ZBA-NEXT: sub a7, a7, t3
+; RV32ZBA-NEXT: and a3, t2, a3
+; RV32ZBA-NEXT: sub a3, a7, a3
+; RV32ZBA-NEXT: and a1, t4, a1
; RV32ZBA-NEXT: mulhu a7, t4, a0
-; RV32ZBA-NEXT: add a1, a7, a1
-; RV32ZBA-NEXT: add a1, a1, t5
+; RV32ZBA-NEXT: sub a1, a7, a1
+; RV32ZBA-NEXT: sub a1, a1, t5
; RV32ZBA-NEXT: add a1, a1, a3
-; RV32ZBA-NEXT: sltu a3, t6, t5
+; RV32ZBA-NEXT: sltu a3, s0, t6
; RV32ZBA-NEXT: add a1, a1, a3
; RV32ZBA-NEXT: add a1, a6, a1
-; RV32ZBA-NEXT: add a1, a1, s1
+; RV32ZBA-NEXT: add a1, a1, s2
; RV32ZBA-NEXT: srai a3, a5, 31
; RV32ZBA-NEXT: xor a1, a1, a3
-; RV32ZBA-NEXT: xor a3, s0, a3
+; RV32ZBA-NEXT: xor a3, s1, a3
; RV32ZBA-NEXT: or a1, a3, a1
; RV32ZBA-NEXT: snez a1, a1
; RV32ZBA-NEXT: mul a0, a0, a2
@@ -1084,6 +1091,7 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -1115,8 +1123,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: srai a1, a1, 31
-; RV32-NEXT: mul a6, a1, a3
-; RV32-NEXT: add a6, a5, a6
+; RV32-NEXT: andi a6, a1, 13
+; RV32-NEXT: sub a6, a5, a6
; RV32-NEXT: srai a7, a4, 31
; RV32-NEXT: xor t0, a6, a7
; RV32-NEXT: sltu a5, a6, a5
@@ -1152,8 +1160,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: srai a1, a1, 31
-; RV32ZBA-NEXT: mul a6, a1, a3
-; RV32ZBA-NEXT: add a6, a5, a6
+; RV32ZBA-NEXT: andi a6, a1, 13
+; RV32ZBA-NEXT: sub a6, a5, a6
; RV32ZBA-NEXT: srai a7, a4, 31
; RV32ZBA-NEXT: xor t0, a6, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
@@ -2352,7 +2360,9 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
; RV32-NEXT: add a4, a5, a4
@@ -2368,33 +2378,34 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mul a7, a1, a3
; RV32-NEXT: add t0, a7, a6
; RV32-NEXT: srai t1, a1, 31
-; RV32-NEXT: mul t2, a2, t1
+; RV32-NEXT: and t2, t1, a2
; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: mul t4, t3, a0
-; RV32-NEXT: add t5, t4, t2
-; RV32-NEXT: add t6, t0, t5
-; RV32-NEXT: sltu s0, t6, t0
+; RV32-NEXT: and t4, t3, a0
+; RV32-NEXT: neg t5, t4
+; RV32-NEXT: sub t6, t5, t2
+; RV32-NEXT: add s0, t0, t6
+; RV32-NEXT: sltu s1, s0, t0
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a6, a2, t1
-; RV32-NEXT: add a6, a6, t2
-; RV32-NEXT: mul a7, a3, t1
-; RV32-NEXT: add a6, a6, a7
-; RV32-NEXT: mul a7, t3, a1
+; RV32-NEXT: sub a6, a6, t2
+; RV32-NEXT: and a7, t1, a3
+; RV32-NEXT: sub a6, a6, a7
+; RV32-NEXT: and a7, t3, a1
; RV32-NEXT: mulhu t0, t3, a0
-; RV32-NEXT: add a7, t0, a7
-; RV32-NEXT: add a7, a7, t4
+; RV32-NEXT: sub a7, t0, a7
+; RV32-NEXT: sub a7, a7, t4
; RV32-NEXT: add a6, a7, a6
-; RV32-NEXT: sltu a7, t5, t4
+; RV32-NEXT: sltu a7, t6, t5
; RV32-NEXT: add a6, a6, a7
; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: add a5, a5, s0
+; RV32-NEXT: add a5, a5, s1
; RV32-NEXT: srai a4, a4, 31
; RV32-NEXT: xor a5, a5, a4
-; RV32-NEXT: xor a4, t6, a4
+; RV32-NEXT: xor a4, s0, a4
; RV32-NEXT: or a4, a4, a5
; RV32-NEXT: bnez a4, .LBB46_2
; RV32-NEXT: # %bb.1: # %entry
@@ -2402,6 +2413,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mv a1, a3
; RV32-NEXT: .LBB46_2: # %entry
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -2421,7 +2433,9 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: addi sp, sp, -16
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
+; RV32ZBA-NEXT: .cfi_offset s1, -8
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
; RV32ZBA-NEXT: add a4, a5, a4
@@ -2437,33 +2451,34 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mul a7, a1, a3
; RV32ZBA-NEXT: add t0, a7, a6
; RV32ZBA-NEXT: srai t1, a1, 31
-; RV32ZBA-NEXT: mul t2, a2, t1
+; RV32ZBA-NEXT: and t2, t1, a2
; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: mul t4, t3, a0
-; RV32ZBA-NEXT: add t5, t4, t2
-; RV32ZBA-NEXT: add t6, t0, t5
-; RV32ZBA-NEXT: sltu s0, t6, t0
+; RV32ZBA-NEXT: and t4, t3, a0
+; RV32ZBA-NEXT: neg t5, t4
+; RV32ZBA-NEXT: sub t6, t5, t2
+; RV32ZBA-NEXT: add s0, t0, t6
+; RV32ZBA-NEXT: sltu s1, s0, t0
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a6, a2, t1
-; RV32ZBA-NEXT: add a6, a6, t2
-; RV32ZBA-NEXT: mul a7, a3, t1
-; RV32ZBA-NEXT: add a6, a6, a7
-; RV32ZBA-NEXT: mul a7, t3, a1
+; RV32ZBA-NEXT: sub a6, a6, t2
+; RV32ZBA-NEXT: and a7, t1, a3
+; RV32ZBA-NEXT: sub a6, a6, a7
+; RV32ZBA-NEXT: and a7, t3, a1
; RV32ZBA-NEXT: mulhu t0, t3, a0
-; RV32ZBA-NEXT: add a7, t0, a7
-; RV32ZBA-NEXT: add a7, a7, t4
+; RV32ZBA-NEXT: sub a7, t0, a7
+; RV32ZBA-NEXT: sub a7, a7, t4
; RV32ZBA-NEXT: add a6, a7, a6
-; RV32ZBA-NEXT: sltu a7, t5, t4
+; RV32ZBA-NEXT: sltu a7, t6, t5
; RV32ZBA-NEXT: add a6, a6, a7
; RV32ZBA-NEXT: add a5, a5, a6
-; RV32ZBA-NEXT: add a5, a5, s0
+; RV32ZBA-NEXT: add a5, a5, s1
; RV32ZBA-NEXT: srai a4, a4, 31
; RV32ZBA-NEXT: xor a5, a5, a4
-; RV32ZBA-NEXT: xor a4, t6, a4
+; RV32ZBA-NEXT: xor a4, s0, a4
; RV32ZBA-NEXT: or a4, a4, a5
; RV32ZBA-NEXT: bnez a4, .LBB46_2
; RV32ZBA-NEXT: # %bb.1: # %entry
@@ -2471,6 +2486,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mv a1, a3
; RV32ZBA-NEXT: .LBB46_2: # %entry
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -2497,7 +2513,9 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
; RV32-NEXT: add a4, a5, a4
@@ -2513,36 +2531,38 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mul a7, a1, a3
; RV32-NEXT: add t0, a7, a6
; RV32-NEXT: srai t1, a1, 31
-; RV32-NEXT: mul t2, a2, t1
+; RV32-NEXT: and t2, t1, a2
; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: mul t4, t3, a0
-; RV32-NEXT: add t5, t4, t2
-; RV32-NEXT: add t6, t0, t5
-; RV32-NEXT: sltu s0, t6, t0
+; RV32-NEXT: and t4, t3, a0
+; RV32-NEXT: neg t5, t4
+; RV32-NEXT: sub t6, t5, t2
+; RV32-NEXT: add s0, t0, t6
+; RV32-NEXT: sltu s1, s0, t0
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a2, a2, t1
-; RV32-NEXT: add a2, a2, t2
-; RV32-NEXT: mul a3, a3, t1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: mul a1, t3, a1
+; RV32-NEXT: sub a2, a2, t2
+; RV32-NEXT: and a3, t1, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: and a1, t3, a1
; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, a0, t4
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: sub a0, a0, t4
; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: sltu a1, t5, t4
+; RV32-NEXT: sltu a1, t6, t5
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a0, a0, s0
+; RV32-NEXT: add a0, a0, s1
; RV32-NEXT: srai a1, a4, 31
; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: xor a1, t6, a1
+; RV32-NEXT: xor a1, s0, a1
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -2560,7 +2580,9 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: addi sp, sp, -16
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
+; RV32ZBA-NEXT: .cfi_offset s1, -8
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
; RV32ZBA-NEXT: add a4, a5, a4
@@ -2576,36 +2598,38 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mul a7, a1, a3
; RV32ZBA-NEXT: add t0, a7, a6
; RV32ZBA-NEXT: srai t1, a1, 31
-; RV32ZBA-NEXT: mul t2, a2, t1
+; RV32ZBA-NEXT: and t2, t1, a2
; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: mul t4, t3, a0
-; RV32ZBA-NEXT: add t5, t4, t2
-; RV32ZBA-NEXT: add t6, t0, t5
-; RV32ZBA-NEXT: sltu s0, t6, t0
+; RV32ZBA-NEXT: and t4, t3, a0
+; RV32ZBA-NEXT: neg t5, t4
+; RV32ZBA-NEXT: sub t6, t5, t2
+; RV32ZBA-NEXT: add s0, t0, t6
+; RV32ZBA-NEXT: sltu s1, s0, t0
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a2, a2, t1
-; RV32ZBA-NEXT: add a2, a2, t2
-; RV32ZBA-NEXT: mul a3, a3, t1
-; RV32ZBA-NEXT: add a2, a2, a3
-; RV32ZBA-NEXT: mul a1, t3, a1
+; RV32ZBA-NEXT: sub a2, a2, t2
+; RV32ZBA-NEXT: and a3, t1, a3
+; RV32ZBA-NEXT: sub a2, a2, a3
+; RV32ZBA-NEXT: and a1, t3, a1
; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: add a0, a0, a1
-; RV32ZBA-NEXT: add a0, a0, t4
+; RV32ZBA-NEXT: sub a0, a0, a1
+; RV32ZBA-NEXT: sub a0, a0, t4
; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: sltu a1, t5, t4
+; RV32ZBA-NEXT: sltu a1, t6, t5
; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: add a0, a5, a0
-; RV32ZBA-NEXT: add a0, a0, s0
+; RV32ZBA-NEXT: add a0, a0, s1
; RV32ZBA-NEXT: srai a1, a4, 31
; RV32ZBA-NEXT: xor a0, a0, a1
-; RV32ZBA-NEXT: xor a1, t6, a1
+; RV32ZBA-NEXT: xor a1, s0, a1
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: seqz a0, a0
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -3453,7 +3477,9 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset s0, -4
+; RV32-NEXT: .cfi_offset s1, -8
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
; RV32-NEXT: add a4, a5, a4
@@ -3469,33 +3495,34 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: mul a7, a1, a3
; RV32-NEXT: add t0, a7, a6
; RV32-NEXT: srai t1, a1, 31
-; RV32-NEXT: mul t2, a2, t1
+; RV32-NEXT: and t2, t1, a2
; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: mul t4, t3, a0
-; RV32-NEXT: add t5, t4, t2
-; RV32-NEXT: add t6, t0, t5
-; RV32-NEXT: sltu s0, t6, t0
+; RV32-NEXT: and t4, t3, a0
+; RV32-NEXT: neg t5, t4
+; RV32-NEXT: sub t6, t5, t2
+; RV32-NEXT: add s0, t0, t6
+; RV32-NEXT: sltu s1, s0, t0
; RV32-NEXT: sltu a7, t0, a7
; RV32-NEXT: sltu a5, a6, a5
; RV32-NEXT: mulhu a6, a1, a3
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: add a5, a5, a7
; RV32-NEXT: mulhu a2, a2, t1
-; RV32-NEXT: add a2, a2, t2
-; RV32-NEXT: mul a3, a3, t1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: mul a1, t3, a1
+; RV32-NEXT: sub a2, a2, t2
+; RV32-NEXT: and a3, t1, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: and a1, t3, a1
; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a0, a0, t4
+; RV32-NEXT: sub a0, a0, a1
+; RV32-NEXT: sub a0, a0, t4
; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: sltu a1, t5, t4
+; RV32-NEXT: sltu a1, t6, t5
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: add a0, a5, a0
-; RV32-NEXT: add a0, a0, s0
+; RV32-NEXT: add a0, a0, s1
; RV32-NEXT: srai a1, a4, 31
; RV32-NEXT: xor a0, a0, a1
-; RV32-NEXT: xor a1, t6, a1
+; RV32-NEXT: xor a1, s0, a1
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: beqz a0, .LBB61_2
; RV32-NEXT: # %bb.1: # %overflow
@@ -3505,6 +3532,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32-NEXT: li a0, 1
; RV32-NEXT: .LBB61_3: # %overflow
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
@@ -3526,7 +3554,9 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: addi sp, sp, -16
; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; RV32ZBA-NEXT: .cfi_offset s0, -4
+; RV32ZBA-NEXT: .cfi_offset s1, -8
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
; RV32ZBA-NEXT: add a4, a5, a4
@@ -3542,33 +3572,34 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: mul a7, a1, a3
; RV32ZBA-NEXT: add t0, a7, a6
; RV32ZBA-NEXT: srai t1, a1, 31
-; RV32ZBA-NEXT: mul t2, a2, t1
+; RV32ZBA-NEXT: and t2, t1, a2
; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: mul t4, t3, a0
-; RV32ZBA-NEXT: add t5, t4, t2
-; RV32ZBA-NEXT: add t6, t0, t5
-; RV32ZBA-NEXT: sltu s0, t6, t0
+; RV32ZBA-NEXT: and t4, t3, a0
+; RV32ZBA-NEXT: neg t5, t4
+; RV32ZBA-NEXT: sub t6, t5, t2
+; RV32ZBA-NEXT: add s0, t0, t6
+; RV32ZBA-NEXT: sltu s1, s0, t0
; RV32ZBA-NEXT: sltu a7, t0, a7
; RV32ZBA-NEXT: sltu a5, a6, a5
; RV32ZBA-NEXT: mulhu a6, a1, a3
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: add a5, a5, a7
; RV32ZBA-NEXT: mulhu a2, a2, t1
-; RV32ZBA-NEXT: add a2, a2, t2
-; RV32ZBA-NEXT: mul a3, a3, t1
-; RV32ZBA-NEXT: add a2, a2, a3
-; RV32ZBA-NEXT: mul a1, t3, a1
+; RV32ZBA-NEXT: sub a2, a2, t2
+; RV32ZBA-NEXT: and a3, t1, a3
+; RV32ZBA-NEXT: sub a2, a2, a3
+; RV32ZBA-NEXT: and a1, t3, a1
; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: add a0, a0, a1
-; RV32ZBA-NEXT: add a0, a0, t4
+; RV32ZBA-NEXT: sub a0, a0, a1
+; RV32ZBA-NEXT: sub a0, a0, t4
; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: sltu a1, t5, t4
+; RV32ZBA-NEXT: sltu a1, t6, t5
; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: add a0, a5, a0
-; RV32ZBA-NEXT: add a0, a0, s0
+; RV32ZBA-NEXT: add a0, a0, s1
; RV32ZBA-NEXT: srai a1, a4, 31
; RV32ZBA-NEXT: xor a0, a0, a1
-; RV32ZBA-NEXT: xor a1, t6, a1
+; RV32ZBA-NEXT: xor a1, s0, a1
; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: beqz a0, .LBB61_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
@@ -3578,6 +3609,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA-NEXT: li a0, 1
; RV32ZBA-NEXT: .LBB61_3: # %overflow
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
; RV32ZBA-NEXT: ret
;
@@ -3625,8 +3657,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
; RV32-NEXT: add a6, a4, a6
; RV32-NEXT: sub t1, a6, a1
; RV32-NEXT: srai t2, a1, 31
-; RV32-NEXT: mul t3, t2, a2
-; RV32-NEXT: sub t3, t3, a0
+; RV32-NEXT: andi t3, t2, -13
+; RV32-NEXT: sub t3, a5, t3
; RV32-NEXT: add t4, t1, t3
; RV32-NEXT: sltu t5, t4, t1
; RV32-NEXT: neg t6, a1
@@ -3687,8 +3719,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
; RV32ZBA-NEXT: add a6, a4, a6
; RV32ZBA-NEXT: sub t1, a6, a1
; RV32ZBA-NEXT: srai t2, a1, 31
-; RV32ZBA-NEXT: mul t3, t2, a2
-; RV32ZBA-NEXT: sub t3, t3, a0
+; RV32ZBA-NEXT: andi t3, t2, -13
+; RV32ZBA-NEXT: sub t3, a5, t3
; RV32ZBA-NEXT: add t4, t1, t3
; RV32ZBA-NEXT: sltu t5, t4, t1
; RV32ZBA-NEXT: neg t6, a1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 217caeebe6335..9cb0ec4d98fb5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -38,22 +38,23 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_0246_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull lr, r12, r1, r0
-; CHECK-NEXT: umull r2, r5, r3, r0
+; CHECK-NEXT: umull r2, r4, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: mla r4, r1, r2, r12
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r2, r3, r2, r5
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: mla r1, r1, r0, r4
-; CHECK-NEXT: mla r0, r3, r0, r2
+; CHECK-NEXT: and.w r2, r1, r0, asr #31
+; CHECK-NEXT: sub.w r2, r12, r2
+; CHECK-NEXT: and.w r1, r0, r1, asr #31
+; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: and.w r2, r3, r0, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: and.w r0, r0, r3, asr #31
+; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -67,22 +68,23 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_ext0_0246:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: asrs r4, r0, #31
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull lr, r12, r0, r1
-; CHECK-NEXT: umull r2, r5, r0, r3
+; CHECK-NEXT: umull r2, r4, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: mla r2, r0, r2, r12
-; CHECK-NEXT: mla r1, r4, r1, r2
-; CHECK-NEXT: asrs r2, r3, #31
-; CHECK-NEXT: mla r0, r0, r2, r5
-; CHECK-NEXT: mla r0, r4, r3, r0
+; CHECK-NEXT: and.w r2, r0, r1, asr #31
+; CHECK-NEXT: sub.w r2, r12, r2
+; CHECK-NEXT: and.w r1, r1, r0, asr #31
+; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: and.w r2, r0, r3, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: and.w r0, r3, r0, asr #31
+; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -130,23 +132,24 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_1357_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vrev64.32 q1, q0
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull lr, r12, r1, r0
-; CHECK-NEXT: umull r2, r5, r3, r0
+; CHECK-NEXT: umull r2, r4, r3, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: mla r4, r1, r2, r12
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r2, r3, r2, r5
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: mla r1, r1, r0, r4
-; CHECK-NEXT: mla r0, r3, r0, r2
+; CHECK-NEXT: and.w r2, r1, r0, asr #31
+; CHECK-NEXT: sub.w r2, r12, r2
+; CHECK-NEXT: and.w r1, r0, r1, asr #31
+; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: and.w r2, r3, r0, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: and.w r0, r0, r3, asr #31
+; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -160,23 +163,24 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_ext0_1357:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vrev64.32 q1, q0
-; CHECK-NEXT: asrs r4, r0, #31
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull lr, r12, r0, r1
-; CHECK-NEXT: umull r2, r5, r0, r3
+; CHECK-NEXT: umull r2, r4, r0, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: mla r2, r0, r2, r12
-; CHECK-NEXT: mla r1, r4, r1, r2
-; CHECK-NEXT: asrs r2, r3, #31
-; CHECK-NEXT: mla r0, r0, r2, r5
-; CHECK-NEXT: mla r0, r4, r3, r0
+; CHECK-NEXT: and.w r2, r0, r1, asr #31
+; CHECK-NEXT: sub.w r2, r12, r2
+; CHECK-NEXT: and.w r1, r1, r0, asr #31
+; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: and.w r2, r0, r3, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: and.w r0, r3, r0, asr #31
+; CHECK-NEXT: subs r0, r2, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
%out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -230,36 +234,39 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_0213_ext0:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov.f32 s4, s1
; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r2, r5, r3, r0
+; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull lr, r12, r1, r0
+; CHECK-NEXT: umull r2, r4, r3, r0
; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: mla r4, r1, r2, r12
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r5, r3, r2, r5
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: mla r1, r1, r0, r4
-; CHECK-NEXT: mla r3, r3, r0, r5
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r1
+; CHECK-NEXT: and.w r2, r1, r0, asr #31
+; CHECK-NEXT: sub.w r2, r12, r2
+; CHECK-NEXT: and.w r1, r0, r1, asr #31
+; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: and.w r2, r3, r0, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: and.w r3, r0, r3, asr #31
+; CHECK-NEXT: subs r2, r2, r3
+; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: umull r3, r5, r1, r0
-; CHECK-NEXT: mla r5, r1, r2, r5
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r12, r1, r0, r5
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: umull r4, r1, r5, r0
-; CHECK-NEXT: mla r1, r5, r2, r1
-; CHECK-NEXT: asrs r2, r5, #31
+; CHECK-NEXT: and.w r2, r1, r0, asr #31
+; CHECK-NEXT: umull r3, r4, r1, r0
+; CHECK-NEXT: and.w r1, r0, r1, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: sub.w r12, r2, r1
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: umull r4, r1, r2, r0
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT: mla r0, r2, r0, r1
+; CHECK-NEXT: and.w r3, r2, r0, asr #31
+; CHECK-NEXT: and.w r0, r0, r2, asr #31
+; CHECK-NEXT: subs r1, r1, r3
+; CHECK-NEXT: subs r0, r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = sext <4 x i32> %shuf1 to <4 x i64>
@@ -273,36 +280,39 @@ entry:
define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
; CHECK-LABEL: sext32_ext0_0213:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov.f32 s4, s1
-; CHECK-NEXT: asrs r4, r0, #31
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov.f32 s6, s3
-; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: vmov.f32 s4, s1
; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r2, r5, r0, r3
+; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: umull lr, r12, r0, r1
+; CHECK-NEXT: umull r2, r4, r0, r3
; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: mla r2, r0, r2, r12
-; CHECK-NEXT: mla r1, r4, r1, r2
-; CHECK-NEXT: asrs r2, r3, #31
-; CHECK-NEXT: mla r2, r0, r2, r5
-; CHECK-NEXT: mla r2, r4, r3, r2
+; CHECK-NEXT: and.w r2, r0, r1, asr #31
+; CHECK-NEXT: sub.w r2, r12, r2
+; CHECK-NEXT: and.w r1, r1, r0, asr #31
+; CHECK-NEXT: subs r1, r2, r1
+; CHECK-NEXT: and.w r2, r0, r3, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: and.w r3, r3, r0, asr #31
+; CHECK-NEXT: subs r2, r2, r3
; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: umull r2, r3, r0, r1
-; CHECK-NEXT: asrs r5, r1, #31
-; CHECK-NEXT: mla r3, r0, r5, r3
-; CHECK-NEXT: mla r12, r4, r1, r3
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: umull r5, r1, r0, r3
-; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT: asrs r2, r3, #31
-; CHECK-NEXT: mla r0, r0, r2, r1
-; CHECK-NEXT: mla r0, r4, r3, r0
+; CHECK-NEXT: umull r3, r4, r0, r1
+; CHECK-NEXT: and.w r2, r0, r1, asr #31
+; CHECK-NEXT: and.w r1, r1, r0, asr #31
+; CHECK-NEXT: subs r2, r4, r2
+; CHECK-NEXT: sub.w r12, r2, r1
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: umull r4, r1, r0, r2
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT: and.w r3, r0, r2, asr #31
+; CHECK-NEXT: and.w r0, r2, r0, asr #31
+; CHECK-NEXT: subs r1, r1, r3
+; CHECK-NEXT: subs r0, r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
%out1 = sext <4 x i32> %shuf1 to <4 x i64>
diff --git a/llvm/test/CodeGen/X86/extmul128.ll b/llvm/test/CodeGen/X86/extmul128.ll
index a7f2959a23c2c..a2d8211888618 100644
--- a/llvm/test/CodeGen/X86/extmul128.ll
+++ b/llvm/test/CodeGen/X86/extmul128.ll
@@ -29,8 +29,8 @@ define i128 @i64_zext_sext_i128(i64 %a, i64 %b) {
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %rsi
; CHECK-NEXT: sarq $63, %rsi
-; CHECK-NEXT: imulq %rdi, %rsi
-; CHECK-NEXT: addq %rsi, %rdx
+; CHECK-NEXT: andq %rdi, %rsi
+; CHECK-NEXT: subq %rsi, %rdx
; CHECK-NEXT: retq
%aa = zext i64 %a to i128
%bb = sext i64 %b to i128
@@ -45,6 +45,37 @@ define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {
; CHECK-NEXT: movq %rdi, %rcx
; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: subq %rcx, %rdx
+; CHECK-NEXT: retq
+ %aa = sext i64 %a to i128
+ %bb = zext i64 %b to i128
+ %cc = mul i128 %aa, %bb
+ ret i128 %cc
+}
+
+define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize {
+; CHECK-LABEL: i64_zext_sext_i128_minsize:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: sarq $63, %rsi
+; CHECK-NEXT: imulq %rdi, %rsi
+; CHECK-NEXT: addq %rsi, %rdx
+; CHECK-NEXT: retq
+ %aa = zext i64 %a to i128
+ %bb = sext i64 %b to i128
+ %cc = mul i128 %aa, %bb
+ ret i128 %cc
+}
+
+define i128 @i64_sext_zext_i128_minsize(i64 %a, i64 %b) minsize {
+; CHECK-LABEL: i64_sext_zext_i128_minsize:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: sarq $63, %rcx
+; CHECK-NEXT: mulq %rsi
; CHECK-NEXT: imulq %rsi, %rcx
; CHECK-NEXT: addq %rcx, %rdx
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 9a6cf0b065662..3733306f354a5 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -7,34 +7,39 @@
define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp {
; CHECK-LABEL: x:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r15
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: .cfi_offset %r14, -16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset %rbx, -32
+; CHECK-NEXT: .cfi_offset %r14, -24
+; CHECK-NEXT: .cfi_offset %r15, -16
; CHECK-NEXT: movq %rdx, %r11
; CHECK-NEXT: movq %rdi, %r9
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: sarq $63, %rbx
-; CHECK-NEXT: movq %rdx, %rdi
-; CHECK-NEXT: imulq %rbx, %rdi
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: sarq $63, %rdi
+; CHECK-NEXT: movq %rdi, %r10
+; CHECK-NEXT: andq %rdx, %r10
; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: mulq %rbx
+; CHECK-NEXT: mulq %rdi
; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: addq %rdi, %rdx
-; CHECK-NEXT: imulq %rcx, %rbx
-; CHECK-NEXT: addq %rdx, %rbx
-; CHECK-NEXT: movq %rcx, %rdi
-; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: movq %rdi, %r14
-; CHECK-NEXT: imulq %rsi, %r14
-; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq %rdx, %rbx
+; CHECK-NEXT: subq %r10, %rbx
+; CHECK-NEXT: andq %rcx, %rdi
+; CHECK-NEXT: subq %rdi, %rbx
+; CHECK-NEXT: movq %rcx, %r14
+; CHECK-NEXT: sarq $63, %r14
+; CHECK-NEXT: movq %r14, %r15
+; CHECK-NEXT: andq %rsi, %r15
+; CHECK-NEXT: movq %r14, %rax
; CHECK-NEXT: mulq %r9
; CHECK-NEXT: movq %rax, %r10
-; CHECK-NEXT: addq %r14, %rdx
-; CHECK-NEXT: imulq %r9, %rdi
-; CHECK-NEXT: addq %rdx, %rdi
+; CHECK-NEXT: movq %rdx, %rdi
+; CHECK-NEXT: subq %r15, %rdi
+; CHECK-NEXT: andq %r9, %r14
+; CHECK-NEXT: subq %r14, %rdi
; CHECK-NEXT: addq %r8, %r10
; CHECK-NEXT: adcq %rbx, %rdi
; CHECK-NEXT: movq %r9, %rax
@@ -72,6 +77,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
; CHECK-NEXT: movq %r9, %rdx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
; CHECK-NEXT: retq
; CHECK-NEXT: LBB0_1: ## %overflow
; CHECK-NEXT: ud2
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 996601ed3be64..07debb11b92f7 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -369,8 +369,8 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: .cfi_def_cfa_offset 28
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: .cfi_def_cfa_offset 32
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
@@ -378,52 +378,54 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: imull %ebx, %edi
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: subl %ebx, %esi
+; X86-NEXT: andl %ebp, %edi
+; X86-NEXT: subl %edi, %esi
; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: imull %ebp, %ebx
-; X86-NEXT: addl %edx, %ebx
; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: imull %ecx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: andl %ecx, %ebp
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: imull %esi, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: subl %ebp, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl %edx, %edi
+; X86-NEXT: subl %edi, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
; X86-NEXT: adcl %esi, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl %ebx, %edx
; X86-NEXT: movl %ebp, %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: xorl %edi, %edx
@@ -434,11 +436,11 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF
; X86-NEXT: orl %edx, %edi
; X86-NEXT: notl %ecx
-; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: cmovel %ebp, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %esi, %edx
-; X86-NEXT: addl $8, %esp
+; X86-NEXT: addl $12, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 367ca660cda14..6631c6c4cc014 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -9,39 +9,44 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
; X64-NEXT: .cfi_def_cfa_offset 24
-; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %r12
; X64-NEXT: .cfi_def_cfa_offset 32
-; X64-NEXT: .cfi_offset %rbx, -32
+; X64-NEXT: pushq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 40
+; X64-NEXT: .cfi_offset %rbx, -40
+; X64-NEXT: .cfi_offset %r12, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rdi, %r10
-; X64-NEXT: movq %rsi, %r14
-; X64-NEXT: sarq $63, %r14
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: imulq %r14, %rdi
+; X64-NEXT: movq %rsi, %r9
+; X64-NEXT: sarq $63, %r9
+; X64-NEXT: movq %r9, %r11
+; X64-NEXT: andq %rdx, %r11
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: imulq %rcx, %r14
-; X64-NEXT: addq %rdx, %r14
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: sarq $63, %rdi
-; X64-NEXT: movq %rdi, %r15
-; X64-NEXT: imulq %rsi, %r15
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: subq %r11, %r14
+; X64-NEXT: andq %rcx, %r9
+; X64-NEXT: subq %r9, %r14
+; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: movq %r15, %r12
+; X64-NEXT: andq %rsi, %r12
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r10
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r15, %rdx
-; X64-NEXT: imulq %r10, %rdi
-; X64-NEXT: addq %rdx, %rdi
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: adcq %r14, %rdi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: subq %r12, %r9
+; X64-NEXT: andq %r10, %r15
+; X64-NEXT: subq %r15, %r9
+; X64-NEXT: addq %rdi, %r11
+; X64-NEXT: adcq %r14, %r9
; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %rax, %rdi
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rbx
@@ -61,15 +66,16 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X64-NEXT: addq %r14, %rax
; X64-NEXT: adcq %rbx, %rdx
; X64-NEXT: addq %r11, %rax
-; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: adcq %r9, %rdx
; X64-NEXT: movq %r10, 8(%r8)
; X64-NEXT: sarq $63, %r10
; X64-NEXT: xorq %r10, %rdx
; X64-NEXT: xorq %rax, %r10
; X64-NEXT: orq %rdx, %r10
; X64-NEXT: setne %al
-; X64-NEXT: movq %r9, (%r8)
+; X64-NEXT: movq %rdi, (%r8)
; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
; X64-NEXT: retq
@@ -84,8 +90,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $56, %esp
-; X86-NEXT: .cfi_def_cfa_offset 76
+; X86-NEXT: subl $60, %esp
+; X86-NEXT: .cfi_def_cfa_offset 80
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
@@ -99,226 +105,229 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: addl %esi, %edi
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: adcl %ebx, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: adcl %eax, %ebx
; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl %edi, %esi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: imull %esi, %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: imull %esi, %ebp
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT: subl %esi, %edi
+; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: subl %ebx, %edi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: subl %esi, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: subl %eax, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: addl %esi, (%esp) ## 4-byte Folded Spill
; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl (%esp), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: setb %cl
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: imull %ebx, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: imull %ebx, %edi
-; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sarl $31, %eax
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %ebx, %ecx
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl %eax, %ebx
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: subl %esi, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %ebx, %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ecx, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %eax
-; X86-NEXT: xorl %edx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: xorl %ecx, %ebp
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: orl %ebx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -326,7 +335,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $56, %esp
+; X86-NEXT: addl $60, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -360,234 +369,239 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X64-NEXT: .cfi_offset %r14, -32
; X64-NEXT: .cfi_offset %r15, -24
; X64-NEXT: .cfi_offset %rbp, -16
-; X64-NEXT: movq %rcx, %r11
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: movq %rdi, %r11
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rsi, %r10
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r10, %r14
-; X64-NEXT: adcq %rcx, %r12
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq %rsi, %r12
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %ecx
-; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r12, %rbx
-; X64-NEXT: adcq %rcx, %r11
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: movq %r9, %rcx
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %r12, %rsi
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r8, %rdi
; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %r8, %r13
; X64-NEXT: adcq $0, %r12
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %r9, %rsi
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: addq %r13, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r12, %r10
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r15, %r9
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rsi
+; X64-NEXT: adcq %r12, %rdi
+; X64-NEXT: setb %r9b
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r10, %r8
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: addq %rdi, %r8
+; X64-NEXT: movzbl %r9b, %eax
; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14
; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
-; X64-NEXT: adcq %r14, %rbp
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: adcq $0, %r11
+; X64-NEXT: adcq %rbx, %rbp
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: movq %r15, %r12
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, %rsi
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r10, %rcx
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %r10, %r9
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rdi, %r10
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r10, %r15
; X64-NEXT: adcq %r13, %r11
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: setb %r10b
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rdi
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %r11, %r13
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: addq %r8, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %rbp, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movzbl %r10b, %eax
+; X64-NEXT: adcq %rax, %rdi
+; X64-NEXT: addq %r8, %rbx
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %rbp, %r15
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: addq %rbx, %r13
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
-; X64-NEXT: setb %cl
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq %rsi, %r13
+; X64-NEXT: adcq %r12, %rdi
+; X64-NEXT: setb %r11b
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, %r15
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %r8
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: addq %r8, %rax
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: adcq %rdi, %r9
-; X64-NEXT: setb %r8b
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: adcq %rsi, %r10
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %rbx, %rsi
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r9, %r14
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: addq %r13, %r11
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r10, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r10, %rbx
; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r14
+; X64-NEXT: adcq %rax, %rbp
+; X64-NEXT: addq %r13, %r15
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %rdi, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movzbl %r11b, %eax
+; X64-NEXT: adcq %rax, %rbx
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %rbx, %r13
-; X64-NEXT: movq %rbx, %r10
+; X64-NEXT: movq %rsi, %r13
; X64-NEXT: sarq $63, %r13
; X64-NEXT: movq %r13, %rcx
-; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: andq %r9, %rcx
; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: imulq %r13, %r15
-; X64-NEXT: addq %rdx, %r15
-; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: subq %rcx, %r10
+; X64-NEXT: andq %r13, %r14
+; X64-NEXT: subq %r14, %r10
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT: andq %r14, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT: imulq %rdi, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: imulq %r13, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: subq %rsi, %rcx
+; X64-NEXT: andq %r13, %rdi
+; X64-NEXT: subq %rdi, %rcx
+; X64-NEXT: movq %rax, %rsi
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: addq %rax, %r8
-; X64-NEXT: adcq %r15, %rsi
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: adcq %r10, %rcx
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r13
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r9, %r15
+; X64-NEXT: addq %r11, %r15
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: adcq $0, %r13
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: adcq %r9, %r13
-; X64-NEXT: setb %cl
+; X64-NEXT: addq %rsi, %r15
+; X64-NEXT: adcq %r11, %r13
+; X64-NEXT: setb %sil
; X64-NEXT: addq %rax, %r13
-; X64-NEXT: movzbl %cl, %r9d
-; X64-NEXT: adcq %rdx, %r9
+; X64-NEXT: movzbl %sil, %esi
+; X64-NEXT: adcq %rdx, %rsi
; X64-NEXT: addq %r8, %r13
-; X64-NEXT: adcq %rsi, %r9
-; X64-NEXT: sarq $63, %r12
+; X64-NEXT: adcq %rcx, %rsi
+; X64-NEXT: sarq $63, %r9
+; X64-NEXT: movq %r9, %r8
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: imulq %r12, %r8
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: andq %rax, %r8
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %rcx
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: addq %rdx, %r8
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: subq %r8, %r14
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: imulq %r12, %rbx
-; X64-NEXT: addq %r8, %rbx
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: subq %rax, %r14
+; X64-NEXT: movq %r9, %r12
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: imulq %r12, %rcx
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: imulq %r12, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: addq %rsi, %r8
-; X64-NEXT: adcq %rbx, %r10
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: andq %rax, %r12
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: subq %r12, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT: andq %r9, %rax
+; X64-NEXT: subq %rax, %r8
+; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: adcq %r14, %r8
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: addq %r11, %r14
; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: movq %r9, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: addq %rax, %rbx
+; X64-NEXT: addq %rax, %r14
; X64-NEXT: adcq %rdx, %r11
-; X64-NEXT: setb %cl
+; X64-NEXT: setb %r9b
; X64-NEXT: addq %rax, %r11
-; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movzbl %r9b, %eax
; X64-NEXT: adcq %rdx, %rax
-; X64-NEXT: addq %r8, %r11
-; X64-NEXT: adcq %r10, %rax
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %rbx
+; X64-NEXT: addq %r10, %r11
+; X64-NEXT: adcq %r8, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT: adcq %r15, %r14
; X64-NEXT: adcq %r13, %r11
-; X64-NEXT: adcq %r9, %rax
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
-; X64-NEXT: adcq %r14, %r11
+; X64-NEXT: adcq %rsi, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; X64-NEXT: adcq %rbx, %r11
; X64-NEXT: adcq %rbp, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: xorq %rcx, %rbx
-; X64-NEXT: orq %rax, %rbx
-; X64-NEXT: xorq %rcx, %r11
-; X64-NEXT: xorq %rsi, %rcx
-; X64-NEXT: orq %r11, %rcx
-; X64-NEXT: orq %rbx, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: sarq $63, %rdx
+; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: xorq %rdx, %r14
+; X64-NEXT: orq %rax, %r14
+; X64-NEXT: xorq %rdx, %r11
+; X64-NEXT: xorq %rcx, %rdx
+; X64-NEXT: orq %r11, %rdx
+; X64-NEXT: orq %r14, %rdx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq %rdx, 24(%rax)
+; X64-NEXT: movq %rsi, 24(%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
; X64-NEXT: movq %rcx, (%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -613,400 +627,399 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $156, %esp
-; X86-NEXT: .cfi_def_cfa_offset 176
+; X86-NEXT: subl $152, %esp
+; X86-NEXT: .cfi_def_cfa_offset 172
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %ebp
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %ebp, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ebx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb %bl
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: adcl %ebp, %edi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl %bl, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: movl %ebp, %ebx
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl %bl, %ecx
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: adcl $0, %edx
@@ -1019,9 +1032,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %ebp, %ecx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: adcl $0, %eax
; X86-NEXT: adcl $0, %esi
@@ -1034,41 +1047,13 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
@@ -1077,89 +1062,117 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl %ebp, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: adcl %eax, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
@@ -1175,25 +1188,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: addl %ecx, %ebp
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: addl %eax, %ebx
; X86-NEXT: movzbl %cl, %eax
@@ -1201,76 +1214,75 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %al, %edx
; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: setb %cl
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: movl %ebx, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl %edx, %ecx
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: imull %edi, %esi
-; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: movl %edi, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: andl %eax, %ebx
; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: subl %ebx, %ecx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl %esi, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %ebp, %esi
@@ -1280,263 +1292,266 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %edx, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: setb %bl
; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: movzbl %bl, %ebx
+; X86-NEXT: adcl %edx, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: subl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: imull %edi, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT: andl %edi, %edx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: andl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: subl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edi, %ecx
; X86-NEXT: movl %eax, %edx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: adcl %esi, %edx
; X86-NEXT: setb %cl
-; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: addl %eax, %edx
; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: adcl %ebx, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: adcl %edx, %edi
; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movzbl %bl, %ebx
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movzbl %bl, %ebp
+; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl %ebx, %eax
+; X86-NEXT: adcl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebp, %ecx
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl %ecx, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %ebx, %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: setb %al
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: imull %ebp, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %ebp, %eax
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %ebp, %ecx
+; X86-NEXT: andl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: subl %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: subl %ecx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %dl
; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: movzbl %dl, %eax
-; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: movzbl %dl, %ecx
+; X86-NEXT: adcl %ebp, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %ebp, %ecx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: andl %ebp, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: imull %ebp, %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edx, %esi
+; X86-NEXT: subl %esi, %ebx
+; X86-NEXT: andl %ebp, %ecx
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull %ebp, %eax
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: subl %ecx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: subl %eax, %ebp
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: setb %cl
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: sarl $31, %edi
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: orl %edx, %esi
-; X86-NEXT: xorl %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: xorl %edi, %edx
-; X86-NEXT: xorl %edi, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: xorl %edi, %ebp
-; X86-NEXT: orl %eax, %ebp
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: orl %ebp, %edi
-; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebx, 28(%eax)
+; X86-NEXT: movl %ebp, 28(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1552,7 +1567,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 24(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $156, %esp
+; X86-NEXT: addl $152, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index dbec86755a969..641663d9eedfe 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3297,31 +3297,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movq %r8, %r14
+; SSE2-NEXT: movq %rcx, %rbp
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rsi, %r11
; SSE2-NEXT: movq %rdi, %r10
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT: movq %r11, %r12
-; SSE2-NEXT: sarq $63, %r12
-; SSE2-NEXT: movq %r14, %rbx
-; SSE2-NEXT: imulq %r12, %rbx
+; SSE2-NEXT: movq %r11, %rbx
+; SSE2-NEXT: sarq $63, %rbx
+; SSE2-NEXT: movq %rbx, %r15
+; SSE2-NEXT: andq %r14, %r15
; SSE2-NEXT: movq %r14, %rax
-; SSE2-NEXT: mulq %r12
+; SSE2-NEXT: mulq %rbx
; SSE2-NEXT: movq %rax, %rdi
-; SSE2-NEXT: addq %rbx, %rdx
-; SSE2-NEXT: imulq %r9, %r12
-; SSE2-NEXT: addq %rdx, %r12
-; SSE2-NEXT: movq %r9, %rbx
-; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: movq %rbx, %r13
-; SSE2-NEXT: imulq %r11, %r13
-; SSE2-NEXT: movq %rbx, %rax
+; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: subq %r15, %r12
+; SSE2-NEXT: andq %r9, %rbx
+; SSE2-NEXT: subq %rbx, %r12
+; SSE2-NEXT: movq %r9, %r13
+; SSE2-NEXT: sarq $63, %r13
+; SSE2-NEXT: movq %r13, %rcx
+; SSE2-NEXT: andq %r11, %rcx
+; SSE2-NEXT: movq %r13, %rax
; SSE2-NEXT: mulq %r10
; SSE2-NEXT: movq %rax, %r15
-; SSE2-NEXT: addq %r13, %rdx
-; SSE2-NEXT: imulq %r10, %rbx
-; SSE2-NEXT: addq %rdx, %rbx
+; SSE2-NEXT: movq %rdx, %rbx
+; SSE2-NEXT: subq %rcx, %rbx
+; SSE2-NEXT: andq %r10, %r13
+; SSE2-NEXT: subq %r13, %rbx
; SSE2-NEXT: addq %rdi, %r15
; SSE2-NEXT: adcq %r12, %rbx
; SSE2-NEXT: movq %r10, %rax
@@ -3341,11 +3343,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: addq %r13, %r10
; SSE2-NEXT: adcq %r14, %r12
; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %r14d
+; SSE2-NEXT: movzbl %al, %ecx
; SSE2-NEXT: movq %r11, %rax
; SSE2-NEXT: mulq %r9
; SSE2-NEXT: addq %r12, %rax
-; SSE2-NEXT: adcq %r14, %rdx
+; SSE2-NEXT: adcq %rcx, %rdx
; SSE2-NEXT: addq %r15, %rax
; SSE2-NEXT: adcq %rbx, %rdx
; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3356,52 +3358,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: xorl %r15d, %r15d
; SSE2-NEXT: orq %rdx, %r10
; SSE2-NEXT: setne %r15b
-; SSE2-NEXT: movq %rcx, %rbx
-; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: movq %rsi, %r10
-; SSE2-NEXT: imulq %rbx, %r10
+; SSE2-NEXT: movq %rbp, %rcx
+; SSE2-NEXT: sarq $63, %rcx
+; SSE2-NEXT: movq %rcx, %r11
+; SSE2-NEXT: andq %rsi, %r11
; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: mulq %rbx
+; SSE2-NEXT: mulq %rcx
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: addq %r10, %rdx
-; SSE2-NEXT: imulq %rbp, %rbx
-; SSE2-NEXT: addq %rdx, %rbx
-; SSE2-NEXT: movq %rbp, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: movq %r10, %r14
-; SSE2-NEXT: imulq %rcx, %r14
-; SSE2-NEXT: movq %r10, %rax
-; SSE2-NEXT: mulq %r8
+; SSE2-NEXT: movq %rdx, %r10
+; SSE2-NEXT: subq %r11, %r10
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: andq %rax, %rcx
+; SSE2-NEXT: subq %rcx, %r10
; SSE2-NEXT: movq %rax, %r11
-; SSE2-NEXT: addq %r14, %rdx
-; SSE2-NEXT: imulq %r8, %r10
-; SSE2-NEXT: addq %rdx, %r10
-; SSE2-NEXT: addq %r9, %r11
-; SSE2-NEXT: adcq %rbx, %r10
+; SSE2-NEXT: movq %rax, %r13
+; SSE2-NEXT: sarq $63, %r11
+; SSE2-NEXT: movq %r11, %rcx
+; SSE2-NEXT: andq %rbp, %rcx
+; SSE2-NEXT: movq %r11, %rax
+; SSE2-NEXT: mulq %r8
+; SSE2-NEXT: movq %rax, %rbx
+; SSE2-NEXT: movq %rdx, %r14
+; SSE2-NEXT: subq %rcx, %r14
+; SSE2-NEXT: andq %r8, %r11
+; SSE2-NEXT: subq %r11, %r14
+; SSE2-NEXT: addq %r9, %rbx
+; SSE2-NEXT: adcq %r10, %r14
; SSE2-NEXT: movq %r8, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %r9
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: movq %rbp, %rax
; SSE2-NEXT: mulq %rsi
; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: movq %rax, %r14
-; SSE2-NEXT: addq %r9, %r14
+; SSE2-NEXT: movq %rax, %r11
+; SSE2-NEXT: addq %r9, %r11
; SSE2-NEXT: adcq $0, %rsi
; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: mulq %rbp
+; SSE2-NEXT: mulq %r13
; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rax, %r9
-; SSE2-NEXT: addq %r14, %r9
+; SSE2-NEXT: addq %r11, %r9
; SSE2-NEXT: adcq %rsi, %r8
; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %esi
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: mulq %rbp
+; SSE2-NEXT: movzbl %al, %ecx
+; SSE2-NEXT: movq %rbp, %rax
+; SSE2-NEXT: mulq %r13
; SSE2-NEXT: addq %r8, %rax
-; SSE2-NEXT: adcq %rsi, %rdx
-; SSE2-NEXT: addq %r11, %rax
-; SSE2-NEXT: adcq %r10, %rdx
+; SSE2-NEXT: adcq %rcx, %rdx
+; SSE2-NEXT: addq %rbx, %rax
+; SSE2-NEXT: adcq %r14, %rdx
; SSE2-NEXT: movq %r9, 24(%r12)
; SSE2-NEXT: sarq $63, %r9
; SSE2-NEXT: xorq %r9, %rdx
@@ -3414,7 +3420,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: negl %r15d
; SSE2-NEXT: movd %r15d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %rbx, 16(%r12)
+; SSE2-NEXT: movq %r10, 16(%r12)
; SSE2-NEXT: movq %rdi, (%r12)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
@@ -3433,31 +3439,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
; SSSE3-NEXT: movq %r8, %r14
+; SSSE3-NEXT: movq %rcx, %rbp
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rsi, %r11
; SSSE3-NEXT: movq %rdi, %r10
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSSE3-NEXT: movq %r11, %r12
-; SSSE3-NEXT: sarq $63, %r12
-; SSSE3-NEXT: movq %r14, %rbx
-; SSSE3-NEXT: imulq %r12, %rbx
+; SSSE3-NEXT: movq %r11, %rbx
+; SSSE3-NEXT: sarq $63, %rbx
+; SSSE3-NEXT: movq %rbx, %r15
+; SSSE3-NEXT: andq %r14, %r15
; SSSE3-NEXT: movq %r14, %rax
-; SSSE3-NEXT: mulq %r12
+; SSSE3-NEXT: mulq %rbx
; SSSE3-NEXT: movq %rax, %rdi
-; SSSE3-NEXT: addq %rbx, %rdx
-; SSSE3-NEXT: imulq %r9, %r12
-; SSSE3-NEXT: addq %rdx, %r12
-; SSSE3-NEXT: movq %r9, %rbx
-; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: movq %rbx, %r13
-; SSSE3-NEXT: imulq %r11, %r13
-; SSSE3-NEXT: movq %rbx, %rax
+; SSSE3-NEXT: movq %rdx, %r12
+; SSSE3-NEXT: subq %r15, %r12
+; SSSE3-NEXT: andq %r9, %rbx
+; SSSE3-NEXT: subq %rbx, %r12
+; SSSE3-NEXT: movq %r9, %r13
+; SSSE3-NEXT: sarq $63, %r13
+; SSSE3-NEXT: movq %r13, %rcx
+; SSSE3-NEXT: andq %r11, %rcx
+; SSSE3-NEXT: movq %r13, %rax
; SSSE3-NEXT: mulq %r10
; SSSE3-NEXT: movq %rax, %r15
-; SSSE3-NEXT: addq %r13, %rdx
-; SSSE3-NEXT: imulq %r10, %rbx
-; SSSE3-NEXT: addq %rdx, %rbx
+; SSSE3-NEXT: movq %rdx, %rbx
+; SSSE3-NEXT: subq %rcx, %rbx
+; SSSE3-NEXT: andq %r10, %r13
+; SSSE3-NEXT: subq %r13, %rbx
; SSSE3-NEXT: addq %rdi, %r15
; SSSE3-NEXT: adcq %r12, %rbx
; SSSE3-NEXT: movq %r10, %rax
@@ -3477,11 +3485,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: addq %r13, %r10
; SSSE3-NEXT: adcq %r14, %r12
; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %r14d
+; SSSE3-NEXT: movzbl %al, %ecx
; SSSE3-NEXT: movq %r11, %rax
; SSSE3-NEXT: mulq %r9
; SSSE3-NEXT: addq %r12, %rax
-; SSSE3-NEXT: adcq %r14, %rdx
+; SSSE3-NEXT: adcq %rcx, %rdx
; SSSE3-NEXT: addq %r15, %rax
; SSSE3-NEXT: adcq %rbx, %rdx
; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3492,52 +3500,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: xorl %r15d, %r15d
; SSSE3-NEXT: orq %rdx, %r10
; SSSE3-NEXT: setne %r15b
-; SSSE3-NEXT: movq %rcx, %rbx
-; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: movq %rsi, %r10
-; SSSE3-NEXT: imulq %rbx, %r10
+; SSSE3-NEXT: movq %rbp, %rcx
+; SSSE3-NEXT: sarq $63, %rcx
+; SSSE3-NEXT: movq %rcx, %r11
+; SSSE3-NEXT: andq %rsi, %r11
; SSSE3-NEXT: movq %rsi, %rax
-; SSSE3-NEXT: mulq %rbx
+; SSSE3-NEXT: mulq %rcx
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: addq %r10, %rdx
-; SSSE3-NEXT: imulq %rbp, %rbx
-; SSSE3-NEXT: addq %rdx, %rbx
-; SSSE3-NEXT: movq %rbp, %r10
-; SSSE3-NEXT: sarq $63, %r10
-; SSSE3-NEXT: movq %r10, %r14
-; SSSE3-NEXT: imulq %rcx, %r14
-; SSSE3-NEXT: movq %r10, %rax
-; SSSE3-NEXT: mulq %r8
+; SSSE3-NEXT: movq %rdx, %r10
+; SSSE3-NEXT: subq %r11, %r10
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: andq %rax, %rcx
+; SSSE3-NEXT: subq %rcx, %r10
; SSSE3-NEXT: movq %rax, %r11
-; SSSE3-NEXT: addq %r14, %rdx
-; SSSE3-NEXT: imulq %r8, %r10
-; SSSE3-NEXT: addq %rdx, %r10
-; SSSE3-NEXT: addq %r9, %r11
-; SSSE3-NEXT: adcq %rbx, %r10
+; SSSE3-NEXT: movq %rax, %r13
+; SSSE3-NEXT: sarq $63, %r11
+; SSSE3-NEXT: movq %r11, %rcx
+; SSSE3-NEXT: andq %rbp, %rcx
+; SSSE3-NEXT: movq %r11, %rax
+; SSSE3-NEXT: mulq %r8
+; SSSE3-NEXT: movq %rax, %rbx
+; SSSE3-NEXT: movq %rdx, %r14
+; SSSE3-NEXT: subq %rcx, %r14
+; SSSE3-NEXT: andq %r8, %r11
+; SSSE3-NEXT: subq %r11, %r14
+; SSSE3-NEXT: addq %r9, %rbx
+; SSSE3-NEXT: adcq %r10, %r14
; SSSE3-NEXT: movq %r8, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %r9
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: movq %rcx, %rax
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: movq %rbp, %rax
; SSSE3-NEXT: mulq %rsi
; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: movq %rax, %r14
-; SSSE3-NEXT: addq %r9, %r14
+; SSSE3-NEXT: movq %rax, %r11
+; SSSE3-NEXT: addq %r9, %r11
; SSSE3-NEXT: adcq $0, %rsi
; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: mulq %rbp
+; SSSE3-NEXT: mulq %r13
; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rax, %r9
-; SSSE3-NEXT: addq %r14, %r9
+; SSSE3-NEXT: addq %r11, %r9
; SSSE3-NEXT: adcq %rsi, %r8
; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %esi
-; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: mulq %rbp
+; SSSE3-NEXT: movzbl %al, %ecx
+; SSSE3-NEXT: movq %rbp, %rax
+; SSSE3-NEXT: mulq %r13
; SSSE3-NEXT: addq %r8, %rax
-; SSSE3-NEXT: adcq %rsi, %rdx
-; SSSE3-NEXT: addq %r11, %rax
-; SSSE3-NEXT: adcq %r10, %rdx
+; SSSE3-NEXT: adcq %rcx, %rdx
+; SSSE3-NEXT: addq %rbx, %rax
+; SSSE3-NEXT: adcq %r14, %rdx
; SSSE3-NEXT: movq %r9, 24(%r12)
; SSSE3-NEXT: sarq $63, %r9
; SSSE3-NEXT: xorq %r9, %rdx
@@ -3550,7 +3562,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: negl %r15d
; SSSE3-NEXT: movd %r15d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %rbx, 16(%r12)
+; SSSE3-NEXT: movq %r10, 16(%r12)
; SSSE3-NEXT: movq %rdi, (%r12)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
@@ -3569,31 +3581,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
; SSE41-NEXT: movq %r8, %r14
+; SSE41-NEXT: movq %rcx, %rbp
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rsi, %r11
; SSE41-NEXT: movq %rdi, %r10
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movq %r11, %r12
-; SSE41-NEXT: sarq $63, %r12
-; SSE41-NEXT: movq %r14, %rbx
-; SSE41-NEXT: imulq %r12, %rbx
+; SSE41-NEXT: movq %r11, %rbx
+; SSE41-NEXT: sarq $63, %rbx
+; SSE41-NEXT: movq %rbx, %r15
+; SSE41-NEXT: andq %r14, %r15
; SSE41-NEXT: movq %r14, %rax
-; SSE41-NEXT: mulq %r12
+; SSE41-NEXT: mulq %rbx
; SSE41-NEXT: movq %rax, %rdi
-; SSE41-NEXT: addq %rbx, %rdx
-; SSE41-NEXT: imulq %r9, %r12
-; SSE41-NEXT: addq %rdx, %r12
-; SSE41-NEXT: movq %r9, %rbx
-; SSE41-NEXT: sarq $63, %rbx
-; SSE41-NEXT: movq %rbx, %r13
-; SSE41-NEXT: imulq %r11, %r13
-; SSE41-NEXT: movq %rbx, %rax
+; SSE41-NEXT: movq %rdx, %r12
+; SSE41-NEXT: subq %r15, %r12
+; SSE41-NEXT: andq %r9, %rbx
+; SSE41-NEXT: subq %rbx, %r12
+; SSE41-NEXT: movq %r9, %r13
+; SSE41-NEXT: sarq $63, %r13
+; SSE41-NEXT: movq %r13, %rcx
+; SSE41-NEXT: andq %r11, %rcx
+; SSE41-NEXT: movq %r13, %rax
; SSE41-NEXT: mulq %r10
; SSE41-NEXT: movq %rax, %r15
-; SSE41-NEXT: addq %r13, %rdx
-; SSE41-NEXT: imulq %r10, %rbx
-; SSE41-NEXT: addq %rdx, %rbx
+; SSE41-NEXT: movq %rdx, %rbx
+; SSE41-NEXT: subq %rcx, %rbx
+; SSE41-NEXT: andq %r10, %r13
+; SSE41-NEXT: subq %r13, %rbx
; SSE41-NEXT: addq %rdi, %r15
; SSE41-NEXT: adcq %r12, %rbx
; SSE41-NEXT: movq %r10, %rax
@@ -3613,11 +3627,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: addq %r13, %r10
; SSE41-NEXT: adcq %r14, %r12
; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %r14d
+; SSE41-NEXT: movzbl %al, %ecx
; SSE41-NEXT: movq %r11, %rax
; SSE41-NEXT: mulq %r9
; SSE41-NEXT: addq %r12, %rax
-; SSE41-NEXT: adcq %r14, %rdx
+; SSE41-NEXT: adcq %rcx, %rdx
; SSE41-NEXT: addq %r15, %rax
; SSE41-NEXT: adcq %rbx, %rdx
; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3628,52 +3642,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: xorl %r15d, %r15d
; SSE41-NEXT: orq %rdx, %r10
; SSE41-NEXT: setne %r15b
-; SSE41-NEXT: movq %rcx, %rbx
-; SSE41-NEXT: sarq $63, %rbx
-; SSE41-NEXT: movq %rsi, %r10
-; SSE41-NEXT: imulq %rbx, %r10
+; SSE41-NEXT: movq %rbp, %rcx
+; SSE41-NEXT: sarq $63, %rcx
+; SSE41-NEXT: movq %rcx, %r11
+; SSE41-NEXT: andq %rsi, %r11
; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: mulq %rbx
+; SSE41-NEXT: mulq %rcx
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: addq %r10, %rdx
-; SSE41-NEXT: imulq %rbp, %rbx
-; SSE41-NEXT: addq %rdx, %rbx
-; SSE41-NEXT: movq %rbp, %r10
-; SSE41-NEXT: sarq $63, %r10
-; SSE41-NEXT: movq %r10, %r14
-; SSE41-NEXT: imulq %rcx, %r14
-; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %r8
+; SSE41-NEXT: movq %rdx, %r10
+; SSE41-NEXT: subq %r11, %r10
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE41-NEXT: andq %rax, %rcx
+; SSE41-NEXT: subq %rcx, %r10
; SSE41-NEXT: movq %rax, %r11
-; SSE41-NEXT: addq %r14, %rdx
-; SSE41-NEXT: imulq %r8, %r10
-; SSE41-NEXT: addq %rdx, %r10
-; SSE41-NEXT: addq %r9, %r11
-; SSE41-NEXT: adcq %rbx, %r10
+; SSE41-NEXT: movq %rax, %r13
+; SSE41-NEXT: sarq $63, %r11
+; SSE41-NEXT: movq %r11, %rcx
+; SSE41-NEXT: andq %rbp, %rcx
+; SSE41-NEXT: movq %r11, %rax
+; SSE41-NEXT: mulq %r8
+; SSE41-NEXT: movq %rax, %rbx
+; SSE41-NEXT: movq %rdx, %r14
+; SSE41-NEXT: subq %rcx, %r14
+; SSE41-NEXT: andq %r8, %r11
+; SSE41-NEXT: subq %r11, %r14
+; SSE41-NEXT: addq %r9, %rbx
+; SSE41-NEXT: adcq %r10, %r14
; SSE41-NEXT: movq %r8, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %r9
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: movq %rax, %r10
+; SSE41-NEXT: movq %rbp, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: movq %rax, %r14
-; SSE41-NEXT: addq %r9, %r14
+; SSE41-NEXT: movq %rax, %r11
+; SSE41-NEXT: addq %r9, %r11
; SSE41-NEXT: adcq $0, %rsi
; SSE41-NEXT: movq %r8, %rax
-; SSE41-NEXT: mulq %rbp
+; SSE41-NEXT: mulq %r13
; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rax, %r9
-; SSE41-NEXT: addq %r14, %r9
+; SSE41-NEXT: addq %r11, %r9
; SSE41-NEXT: adcq %rsi, %r8
; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %esi
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %rbp
+; SSE41-NEXT: movzbl %al, %ecx
+; SSE41-NEXT: movq %rbp, %rax
+; SSE41-NEXT: mulq %r13
; SSE41-NEXT: addq %r8, %rax
-; SSE41-NEXT: adcq %rsi, %rdx
-; SSE41-NEXT: addq %r11, %rax
-; SSE41-NEXT: adcq %r10, %rdx
+; SSE41-NEXT: adcq %rcx, %rdx
+; SSE41-NEXT: addq %rbx, %rax
+; SSE41-NEXT: adcq %r14, %rdx
; SSE41-NEXT: movq %r9, 24(%r12)
; SSE41-NEXT: sarq $63, %r9
; SSE41-NEXT: xorq %r9, %rdx
@@ -3685,7 +3703,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: negl %r15d
; SSE41-NEXT: movd %r15d, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %rbx, 16(%r12)
+; SSE41-NEXT: movq %r10, 16(%r12)
; SSE41-NEXT: movq %rdi, (%r12)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
@@ -3704,31 +3722,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %r8, %r14
+; AVX-NEXT: movq %rcx, %rbp
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rsi, %r11
; AVX-NEXT: movq %rdi, %r10
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT: movq %r11, %r12
-; AVX-NEXT: sarq $63, %r12
-; AVX-NEXT: movq %r14, %rbx
-; AVX-NEXT: imulq %r12, %rbx
+; AVX-NEXT: movq %r11, %rbx
+; AVX-NEXT: sarq $63, %rbx
+; AVX-NEXT: movq %rbx, %r15
+; AVX-NEXT: andq %r14, %r15
; AVX-NEXT: movq %r14, %rax
-; AVX-NEXT: mulq %r12
+; AVX-NEXT: mulq %rbx
; AVX-NEXT: movq %rax, %rdi
-; AVX-NEXT: addq %rbx, %rdx
-; AVX-NEXT: imulq %r9, %r12
-; AVX-NEXT: addq %rdx, %r12
-; AVX-NEXT: movq %r9, %rbx
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: movq %rbx, %r13
-; AVX-NEXT: imulq %r11, %r13
-; AVX-NEXT: movq %rbx, %rax
+; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: subq %r15, %r12
+; AVX-NEXT: andq %r9, %rbx
+; AVX-NEXT: subq %rbx, %r12
+; AVX-NEXT: movq %r9, %r13
+; AVX-NEXT: sarq $63, %r13
+; AVX-NEXT: movq %r13, %rcx
+; AVX-NEXT: andq %r11, %rcx
+; AVX-NEXT: movq %r13, %rax
; AVX-NEXT: mulq %r10
; AVX-NEXT: movq %rax, %r15
-; AVX-NEXT: addq %r13, %rdx
-; AVX-NEXT: imulq %r10, %rbx
-; AVX-NEXT: addq %rdx, %rbx
+; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: subq %rcx, %rbx
+; AVX-NEXT: andq %r10, %r13
+; AVX-NEXT: subq %r13, %rbx
; AVX-NEXT: addq %rdi, %r15
; AVX-NEXT: adcq %r12, %rbx
; AVX-NEXT: movq %r10, %rax
@@ -3748,11 +3768,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: addq %r13, %r10
; AVX-NEXT: adcq %r14, %r12
; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %r14d
+; AVX-NEXT: movzbl %al, %ecx
; AVX-NEXT: movq %r11, %rax
; AVX-NEXT: mulq %r9
; AVX-NEXT: addq %r12, %rax
-; AVX-NEXT: adcq %r14, %rdx
+; AVX-NEXT: adcq %rcx, %rdx
; AVX-NEXT: addq %r15, %rax
; AVX-NEXT: adcq %rbx, %rdx
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12
@@ -3763,52 +3783,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: xorl %r15d, %r15d
; AVX-NEXT: orq %rdx, %r10
; AVX-NEXT: setne %r15b
-; AVX-NEXT: movq %rcx, %rbx
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: movq %rsi, %r10
-; AVX-NEXT: imulq %rbx, %r10
+; AVX-NEXT: movq %rbp, %rcx
+; AVX-NEXT: sarq $63, %rcx
+; AVX-NEXT: movq %rcx, %r11
+; AVX-NEXT: andq %rsi, %r11
; AVX-NEXT: movq %rsi, %rax
-; AVX-NEXT: mulq %rbx
+; AVX-NEXT: mulq %rcx
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: addq %r10, %rdx
-; AVX-NEXT: imulq %rbp, %rbx
-; AVX-NEXT: addq %rdx, %rbx
-; AVX-NEXT: movq %rbp, %r10
-; AVX-NEXT: sarq $63, %r10
-; AVX-NEXT: movq %r10, %r14
-; AVX-NEXT: imulq %rcx, %r14
-; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r8
+; AVX-NEXT: movq %rdx, %r10
+; AVX-NEXT: subq %r11, %r10
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: andq %rax, %rcx
+; AVX-NEXT: subq %rcx, %r10
; AVX-NEXT: movq %rax, %r11
-; AVX-NEXT: addq %r14, %rdx
-; AVX-NEXT: imulq %r8, %r10
-; AVX-NEXT: addq %rdx, %r10
-; AVX-NEXT: addq %r9, %r11
-; AVX-NEXT: adcq %rbx, %r10
+; AVX-NEXT: movq %rax, %r13
+; AVX-NEXT: sarq $63, %r11
+; AVX-NEXT: movq %r11, %rcx
+; AVX-NEXT: andq %rbp, %rcx
+; AVX-NEXT: movq %r11, %rax
+; AVX-NEXT: mulq %r8
+; AVX-NEXT: movq %rax, %rbx
+; AVX-NEXT: movq %rdx, %r14
+; AVX-NEXT: subq %rcx, %r14
+; AVX-NEXT: andq %r8, %r11
+; AVX-NEXT: subq %r11, %r14
+; AVX-NEXT: addq %r9, %rbx
+; AVX-NEXT: adcq %r10, %r14
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %r9
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: movq %rax, %r10
+; AVX-NEXT: movq %rbp, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: movq %rax, %r14
-; AVX-NEXT: addq %r9, %r14
+; AVX-NEXT: movq %rax, %r11
+; AVX-NEXT: addq %r9, %r11
; AVX-NEXT: adcq $0, %rsi
; AVX-NEXT: movq %r8, %rax
-; AVX-NEXT: mulq %rbp
+; AVX-NEXT: mulq %r13
; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: addq %r14, %r9
+; AVX-NEXT: addq %r11, %r9
; AVX-NEXT: adcq %rsi, %r8
; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %esi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rbp
+; AVX-NEXT: movzbl %al, %ecx
+; AVX-NEXT: movq %rbp, %rax
+; AVX-NEXT: mulq %r13
; AVX-NEXT: addq %r8, %rax
-; AVX-NEXT: adcq %rsi, %rdx
-; AVX-NEXT: addq %r11, %rax
-; AVX-NEXT: adcq %r10, %rdx
+; AVX-NEXT: adcq %rcx, %rdx
+; AVX-NEXT: addq %rbx, %rax
+; AVX-NEXT: adcq %r14, %rdx
; AVX-NEXT: movq %r9, 24(%r12)
; AVX-NEXT: sarq $63, %r9
; AVX-NEXT: xorq %r9, %rdx
@@ -3820,7 +3844,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: negl %r15d
; AVX-NEXT: vmovd %r15d, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %rbx, 16(%r12)
+; AVX-NEXT: movq %r10, 16(%r12)
; AVX-NEXT: movq %rdi, (%r12)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
@@ -3838,32 +3862,35 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %r9, %rbp
+; AVX512F-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512F-NEXT: movq %rcx, %r11
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rsi, %r9
+; AVX512F-NEXT: movq %rsi, %rbp
+; AVX512F-NEXT: movq %rdi, %r9
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: movq %rcx, %r12
-; AVX512F-NEXT: sarq $63, %r12
-; AVX512F-NEXT: movq %r15, %rbx
-; AVX512F-NEXT: imulq %r12, %rbx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: movq %rcx, %rbx
+; AVX512F-NEXT: sarq $63, %rbx
+; AVX512F-NEXT: movq %rbx, %r14
+; AVX512F-NEXT: andq %r15, %r14
; AVX512F-NEXT: movq %r15, %rax
-; AVX512F-NEXT: mulq %r12
+; AVX512F-NEXT: mulq %rbx
; AVX512F-NEXT: movq %rax, %rcx
-; AVX512F-NEXT: addq %rbx, %rdx
-; AVX512F-NEXT: imulq %rsi, %r12
-; AVX512F-NEXT: addq %rdx, %r12
-; AVX512F-NEXT: movq %rsi, %rbx
-; AVX512F-NEXT: sarq $63, %rbx
-; AVX512F-NEXT: movq %rbx, %r13
-; AVX512F-NEXT: imulq %r11, %r13
-; AVX512F-NEXT: movq %rbx, %rax
+; AVX512F-NEXT: movq %rdx, %r12
+; AVX512F-NEXT: subq %r14, %r12
+; AVX512F-NEXT: andq %rdi, %rbx
+; AVX512F-NEXT: subq %rbx, %r12
+; AVX512F-NEXT: movq %rdi, %r13
+; AVX512F-NEXT: sarq $63, %r13
+; AVX512F-NEXT: movq %r13, %rsi
+; AVX512F-NEXT: andq %r11, %rsi
+; AVX512F-NEXT: movq %r13, %rax
; AVX512F-NEXT: mulq %r10
; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: addq %r13, %rdx
-; AVX512F-NEXT: imulq %r10, %rbx
-; AVX512F-NEXT: addq %rdx, %rbx
+; AVX512F-NEXT: movq %rdx, %rbx
+; AVX512F-NEXT: subq %rsi, %rbx
+; AVX512F-NEXT: andq %r10, %r13
+; AVX512F-NEXT: subq %r13, %rbx
; AVX512F-NEXT: addq %rcx, %r14
; AVX512F-NEXT: adcq %r12, %rbx
; AVX512F-NEXT: movq %r10, %rax
@@ -3877,74 +3904,78 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: addq %r12, %r13
; AVX512F-NEXT: adcq $0, %r15
; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: mulq %rdi
; AVX512F-NEXT: movq %rdx, %r12
; AVX512F-NEXT: movq %rax, %r10
; AVX512F-NEXT: addq %r13, %r10
; AVX512F-NEXT: adcq %r15, %r12
; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %r15d
+; AVX512F-NEXT: movzbl %al, %esi
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %rsi
+; AVX512F-NEXT: mulq %rdi
; AVX512F-NEXT: addq %r12, %rax
-; AVX512F-NEXT: adcq %r15, %rdx
+; AVX512F-NEXT: adcq %rsi, %rdx
; AVX512F-NEXT: addq %r14, %rax
; AVX512F-NEXT: adcq %rbx, %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT: movq %r10, 24(%r12)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; AVX512F-NEXT: movq %r10, 24(%r13)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
; AVX512F-NEXT: orq %rdx, %r10
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: movq %r9, %rsi
+; AVX512F-NEXT: movq %rbp, %rsi
; AVX512F-NEXT: sarq $63, %rsi
-; AVX512F-NEXT: movq %r8, %r11
-; AVX512F-NEXT: imulq %rsi, %r11
+; AVX512F-NEXT: movq %rsi, %rdi
+; AVX512F-NEXT: andq %r8, %rdi
; AVX512F-NEXT: movq %r8, %rax
; AVX512F-NEXT: mulq %rsi
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %r11, %rdx
-; AVX512F-NEXT: imulq %rbp, %rsi
-; AVX512F-NEXT: addq %rdx, %rsi
-; AVX512F-NEXT: movq %rbp, %r11
-; AVX512F-NEXT: sarq $63, %r11
-; AVX512F-NEXT: movq %r11, %r14
-; AVX512F-NEXT: imulq %r9, %r14
-; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %rdi
+; AVX512F-NEXT: movq %rdx, %r11
+; AVX512F-NEXT: subq %rdi, %r11
+; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512F-NEXT: andq %rax, %rsi
+; AVX512F-NEXT: subq %rsi, %r11
; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: addq %r14, %rdx
-; AVX512F-NEXT: imulq %rdi, %r11
-; AVX512F-NEXT: addq %rdx, %r11
-; AVX512F-NEXT: addq %r10, %rbx
-; AVX512F-NEXT: adcq %rsi, %r11
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: mulq %r8
-; AVX512F-NEXT: movq %rdx, %r10
+; AVX512F-NEXT: movq %rax, %r12
+; AVX512F-NEXT: sarq $63, %rbx
+; AVX512F-NEXT: movq %rbx, %rsi
+; AVX512F-NEXT: andq %rbp, %rsi
+; AVX512F-NEXT: movq %rbx, %rax
+; AVX512F-NEXT: mulq %r9
; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: movq %rdx, %r15
+; AVX512F-NEXT: subq %rsi, %r15
+; AVX512F-NEXT: andq %r9, %rbx
+; AVX512F-NEXT: subq %rbx, %r15
+; AVX512F-NEXT: addq %r10, %r14
+; AVX512F-NEXT: adcq %r11, %r15
; AVX512F-NEXT: movq %r9, %rax
; AVX512F-NEXT: mulq %r8
+; AVX512F-NEXT: movq %rdx, %r10
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: movq %rbp, %rax
+; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rax, %r15
-; AVX512F-NEXT: addq %r10, %r15
+; AVX512F-NEXT: movq %rax, %rbx
+; AVX512F-NEXT: addq %r10, %rbx
; AVX512F-NEXT: adcq $0, %r8
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: mulq %r12
; AVX512F-NEXT: movq %rdx, %rdi
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %r15, %r10
+; AVX512F-NEXT: addq %rbx, %r10
; AVX512F-NEXT: adcq %r8, %rdi
; AVX512F-NEXT: setb %al
; AVX512F-NEXT: movzbl %al, %esi
-; AVX512F-NEXT: movq %r9, %rax
-; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: movq %rbp, %rax
+; AVX512F-NEXT: mulq %r12
; AVX512F-NEXT: addq %rdi, %rax
; AVX512F-NEXT: adcq %rsi, %rdx
-; AVX512F-NEXT: addq %rbx, %rax
-; AVX512F-NEXT: adcq %r11, %rdx
-; AVX512F-NEXT: movq %r10, 8(%r12)
+; AVX512F-NEXT: addq %r14, %rax
+; AVX512F-NEXT: adcq %r15, %rdx
+; AVX512F-NEXT: movq %r10, 8(%r13)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
; AVX512F-NEXT: xorq %rax, %r10
@@ -3956,8 +3987,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT: movq %rcx, 16(%r12)
-; AVX512F-NEXT: movq %r14, (%r12)
+; AVX512F-NEXT: movq %rcx, 16(%r13)
+; AVX512F-NEXT: movq %r11, (%r13)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r13
@@ -3974,32 +4005,35 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: pushq %r13
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: movq %r9, %rbp
+; AVX512BW-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: movq %rcx, %r11
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rsi, %r9
+; AVX512BW-NEXT: movq %rsi, %rbp
+; AVX512BW-NEXT: movq %rdi, %r9
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512BW-NEXT: movq %rcx, %r12
-; AVX512BW-NEXT: sarq $63, %r12
-; AVX512BW-NEXT: movq %r15, %rbx
-; AVX512BW-NEXT: imulq %r12, %rbx
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512BW-NEXT: movq %rcx, %rbx
+; AVX512BW-NEXT: sarq $63, %rbx
+; AVX512BW-NEXT: movq %rbx, %r14
+; AVX512BW-NEXT: andq %r15, %r14
; AVX512BW-NEXT: movq %r15, %rax
-; AVX512BW-NEXT: mulq %r12
+; AVX512BW-NEXT: mulq %rbx
; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: addq %rbx, %rdx
-; AVX512BW-NEXT: imulq %rsi, %r12
-; AVX512BW-NEXT: addq %rdx, %r12
-; AVX512BW-NEXT: movq %rsi, %rbx
-; AVX512BW-NEXT: sarq $63, %rbx
-; AVX512BW-NEXT: movq %rbx, %r13
-; AVX512BW-NEXT: imulq %r11, %r13
-; AVX512BW-NEXT: movq %rbx, %rax
+; AVX512BW-NEXT: movq %rdx, %r12
+; AVX512BW-NEXT: subq %r14, %r12
+; AVX512BW-NEXT: andq %rdi, %rbx
+; AVX512BW-NEXT: subq %rbx, %r12
+; AVX512BW-NEXT: movq %rdi, %r13
+; AVX512BW-NEXT: sarq $63, %r13
+; AVX512BW-NEXT: movq %r13, %rsi
+; AVX512BW-NEXT: andq %r11, %rsi
+; AVX512BW-NEXT: movq %r13, %rax
; AVX512BW-NEXT: mulq %r10
; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: addq %r13, %rdx
-; AVX512BW-NEXT: imulq %r10, %rbx
-; AVX512BW-NEXT: addq %rdx, %rbx
+; AVX512BW-NEXT: movq %rdx, %rbx
+; AVX512BW-NEXT: subq %rsi, %rbx
+; AVX512BW-NEXT: andq %r10, %r13
+; AVX512BW-NEXT: subq %r13, %rbx
; AVX512BW-NEXT: addq %rcx, %r14
; AVX512BW-NEXT: adcq %r12, %rbx
; AVX512BW-NEXT: movq %r10, %rax
@@ -4013,74 +4047,78 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: addq %r12, %r13
; AVX512BW-NEXT: adcq $0, %r15
; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: mulq %rdi
; AVX512BW-NEXT: movq %rdx, %r12
; AVX512BW-NEXT: movq %rax, %r10
; AVX512BW-NEXT: addq %r13, %r10
; AVX512BW-NEXT: adcq %r15, %r12
; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %r15d
+; AVX512BW-NEXT: movzbl %al, %esi
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %rsi
+; AVX512BW-NEXT: mulq %rdi
; AVX512BW-NEXT: addq %r12, %rax
-; AVX512BW-NEXT: adcq %r15, %rdx
+; AVX512BW-NEXT: adcq %rsi, %rdx
; AVX512BW-NEXT: addq %r14, %rax
; AVX512BW-NEXT: adcq %rbx, %rdx
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512BW-NEXT: movq %r10, 24(%r12)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; AVX512BW-NEXT: movq %r10, 24(%r13)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
; AVX512BW-NEXT: orq %rdx, %r10
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: movq %r9, %rsi
+; AVX512BW-NEXT: movq %rbp, %rsi
; AVX512BW-NEXT: sarq $63, %rsi
-; AVX512BW-NEXT: movq %r8, %r11
-; AVX512BW-NEXT: imulq %rsi, %r11
+; AVX512BW-NEXT: movq %rsi, %rdi
+; AVX512BW-NEXT: andq %r8, %rdi
; AVX512BW-NEXT: movq %r8, %rax
; AVX512BW-NEXT: mulq %rsi
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %r11, %rdx
-; AVX512BW-NEXT: imulq %rbp, %rsi
-; AVX512BW-NEXT: addq %rdx, %rsi
-; AVX512BW-NEXT: movq %rbp, %r11
-; AVX512BW-NEXT: sarq $63, %r11
-; AVX512BW-NEXT: movq %r11, %r14
-; AVX512BW-NEXT: imulq %r9, %r14
-; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %rdi
+; AVX512BW-NEXT: movq %rdx, %r11
+; AVX512BW-NEXT: subq %rdi, %r11
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: andq %rax, %rsi
+; AVX512BW-NEXT: subq %rsi, %r11
; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: addq %r14, %rdx
-; AVX512BW-NEXT: imulq %rdi, %r11
-; AVX512BW-NEXT: addq %rdx, %r11
-; AVX512BW-NEXT: addq %r10, %rbx
-; AVX512BW-NEXT: adcq %rsi, %r11
-; AVX512BW-NEXT: movq %rdi, %rax
-; AVX512BW-NEXT: mulq %r8
-; AVX512BW-NEXT: movq %rdx, %r10
+; AVX512BW-NEXT: movq %rax, %r12
+; AVX512BW-NEXT: sarq $63, %rbx
+; AVX512BW-NEXT: movq %rbx, %rsi
+; AVX512BW-NEXT: andq %rbp, %rsi
+; AVX512BW-NEXT: movq %rbx, %rax
+; AVX512BW-NEXT: mulq %r9
; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: movq %rdx, %r15
+; AVX512BW-NEXT: subq %rsi, %r15
+; AVX512BW-NEXT: andq %r9, %rbx
+; AVX512BW-NEXT: subq %rbx, %r15
+; AVX512BW-NEXT: addq %r10, %r14
+; AVX512BW-NEXT: adcq %r11, %r15
; AVX512BW-NEXT: movq %r9, %rax
; AVX512BW-NEXT: mulq %r8
+; AVX512BW-NEXT: movq %rdx, %r10
+; AVX512BW-NEXT: movq %rax, %r11
+; AVX512BW-NEXT: movq %rbp, %rax
+; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r8
-; AVX512BW-NEXT: movq %rax, %r15
-; AVX512BW-NEXT: addq %r10, %r15
+; AVX512BW-NEXT: movq %rax, %rbx
+; AVX512BW-NEXT: addq %r10, %rbx
; AVX512BW-NEXT: adcq $0, %r8
-; AVX512BW-NEXT: movq %rdi, %rax
-; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: mulq %r12
; AVX512BW-NEXT: movq %rdx, %rdi
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %r15, %r10
+; AVX512BW-NEXT: addq %rbx, %r10
; AVX512BW-NEXT: adcq %r8, %rdi
; AVX512BW-NEXT: setb %al
; AVX512BW-NEXT: movzbl %al, %esi
-; AVX512BW-NEXT: movq %r9, %rax
-; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: movq %rbp, %rax
+; AVX512BW-NEXT: mulq %r12
; AVX512BW-NEXT: addq %rdi, %rax
; AVX512BW-NEXT: adcq %rsi, %rdx
-; AVX512BW-NEXT: addq %rbx, %rax
-; AVX512BW-NEXT: adcq %r11, %rdx
-; AVX512BW-NEXT: movq %r10, 8(%r12)
+; AVX512BW-NEXT: addq %r14, %rax
+; AVX512BW-NEXT: adcq %r15, %rdx
+; AVX512BW-NEXT: movq %r10, 8(%r13)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
; AVX512BW-NEXT: xorq %rax, %r10
@@ -4092,8 +4130,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: movq %rcx, 16(%r12)
-; AVX512BW-NEXT: movq %r14, (%r12)
+; AVX512BW-NEXT: movq %rcx, 16(%r13)
+; AVX512BW-NEXT: movq %r11, (%r13)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
; AVX512BW-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 4adc80b3b8bd6..508b0d7fe0f2b 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -215,35 +215,36 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: subl $8, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: imull %edi, %esi
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %esi, %edx
-; WIN32-NEXT: movl %ebp, %esi
-; WIN32-NEXT: imull %ebp, %edi
-; WIN32-NEXT: addl %edx, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl %ebx, %esi
; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %ebp
-; WIN32-NEXT: imull %ecx, %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: addl %ebp, %edx
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: addl %ebx, %eax
+; WIN32-NEXT: movl %esi, %edi
+; WIN32-NEXT: andl %eax, %edi
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %edi, %esi
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: subl %edi, %ecx
+; WIN32-NEXT: andl %ebp, %esi
+; WIN32-NEXT: subl %esi, %ecx
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: movl %ebp, %edi
+; WIN32-NEXT: andl %ebx, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull %ebx
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: subl %edi, %esi
+; WIN32-NEXT: andl %ebx, %ebp
+; WIN32-NEXT: subl %ebp, %esi
+; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: adcl %ecx, %esi
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: movl %ebx, %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebx
@@ -262,7 +263,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: addl %edi, %eax
; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; WIN32-NEXT: adcl %esi, %edx
; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: sarl $31, %ecx
@@ -271,7 +272,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl %ebp, 4(%eax)
-; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
; WIN32-NEXT: addl $8, %esp
@@ -573,49 +574,52 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %ebp, %edi
-; WIN32-NEXT: imull %ecx, %edi
-; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: andl %eax, %edi
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: addl %edi, %edx
-; WIN32-NEXT: imull %ebx, %ecx
-; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: movl %ebx, %edi
-; WIN32-NEXT: imull %esi, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: addl %edi, %edx
-; WIN32-NEXT: movl %esi, %edi
-; WIN32-NEXT: imull %esi, %ebx
-; WIN32-NEXT: addl %edx, %ebx
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: subl %edi, %esi
+; WIN32-NEXT: andl %ebx, %ecx
+; WIN32-NEXT: subl %ecx, %esi
+; WIN32-NEXT: movl %ebx, %ecx
+; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: andl %ebp, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: subl %edi, %ebx
+; WIN32-NEXT: movl %ebp, %edi
+; WIN32-NEXT: andl %ebp, %ecx
+; WIN32-NEXT: subl %ecx, %ebx
; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %ebx
+; WIN32-NEXT: adcl %esi, %ebx
; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %esi, %ebp
-; WIN32-NEXT: adcl $0, %ecx
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl %eax, %ecx
+; WIN32-NEXT: addl %esi, %ecx
+; WIN32-NEXT: adcl $0, %ebp
; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %edi
; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %ebp, %esi
+; WIN32-NEXT: addl %ecx, %esi
+; WIN32-NEXT: adcl %ebp, %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: adcl %ecx, %edi
; WIN32-NEXT: setb %cl
; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
@@ -999,30 +1003,32 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: imull %edi, %esi
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %esi, %edx
-; WIN32-NEXT: movl %ebx, %esi
-; WIN32-NEXT: imull %ebx, %edi
-; WIN32-NEXT: addl %edx, %edi
+; WIN32-NEXT: movl %ecx, %esi
+; WIN32-NEXT: movl %ecx, %ebp
; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %ebx
-; WIN32-NEXT: imull %ecx, %ebx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: addl %ebx, %edx
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: addl %edx, %esi
-; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: movl %esi, %edi
+; WIN32-NEXT: andl %eax, %edi
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %edi, %esi
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: movl %ecx, %edi
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: subl %edi, %ecx
+; WIN32-NEXT: andl %ebx, %esi
+; WIN32-NEXT: subl %esi, %ecx
+; WIN32-NEXT: sarl $31, %ebx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: andl %ebp, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: subl %edi, %esi
+; WIN32-NEXT: andl %ebp, %ebx
+; WIN32-NEXT: subl %ebx, %esi
+; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: adcl %ecx, %esi
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl %ebp, %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebx
@@ -1704,57 +1710,62 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: subl $16, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl (%eax), %esi
-; WIN32-NEXT: movl 4(%eax), %ebp
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: movl %ebx, %ecx
-; WIN32-NEXT: imull %ebp, %ecx
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull %esi
+; WIN32-NEXT: movl 4(%eax), %eax
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: movl %edi, %ecx
+; WIN32-NEXT: andl %eax, %ecx
+; WIN32-NEXT: movl %eax, %ebx
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: addl %ecx, %edx
+; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: mull %esi
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: subl %ecx, %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: imull %esi, %ebx
-; WIN32-NEXT: addl %edx, %ebx
-; WIN32-NEXT: movl %ebp, %ecx
-; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: andl %esi, %edi
+; WIN32-NEXT: subl %edi, %ebp
+; WIN32-NEXT: movl %ebx, %ecx
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: imull %ecx, %edi
+; WIN32-NEXT: movl %ecx, %ebx
+; WIN32-NEXT: andl %eax, %ebx
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: addl %edi, %edx
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: addl %edx, %ecx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: subl %ebx, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: andl %edx, %ecx
+; WIN32-NEXT: subl %ecx, %edi
+; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %ebx, %ecx
+; WIN32-NEXT: adcl %ebp, %edi
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: addl %ebx, %edi
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; WIN32-NEXT: adcl $0, %ebp
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: adcl %ebp, %ebx
-; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT: addl %ebx, %esi
+; WIN32-NEXT: adcl %ebp, %ecx
+; WIN32-NEXT: setb %bl
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; WIN32-NEXT: adcl %edi, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl %ecx, %eax
+; WIN32-NEXT: movzbl %bl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT: adcl %edi, %edx
; WIN32-NEXT: movl %esi, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
@@ -1762,7 +1773,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl %esi, 4(%eax)
-; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
; WIN32-NEXT: addl $16, %esp
@@ -1810,35 +1821,35 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: subl $12, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl (%eax), %ebp
; WIN32-NEXT: movl 4(%eax), %ebx
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: sarl $31, %edi
-; WIN32-NEXT: movl %ebp, %esi
-; WIN32-NEXT: imull %edi, %esi
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: movl %esi, %edi
+; WIN32-NEXT: andl %ebp, %edi
; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %edi
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: addl %esi, %edx
-; WIN32-NEXT: movl %ebx, %esi
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: subl %edi, %ecx
; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: imull %ebx, %edi
-; WIN32-NEXT: addl %edx, %edi
-; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: movl %esi, %ebx
-; WIN32-NEXT: imull %ecx, %ebx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: addl %ebx, %edx
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: addl %edx, %esi
+; WIN32-NEXT: andl %ebx, %esi
+; WIN32-NEXT: subl %esi, %ecx
+; WIN32-NEXT: sarl $31, %ebx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: andl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl %edx, %esi
+; WIN32-NEXT: subl %edi, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: andl %edx, %ebx
+; WIN32-NEXT: subl %ebx, %esi
; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %edi, %esi
-; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: adcl %ecx, %esi
+; WIN32-NEXT: movl %edx, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %edx, %edi
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
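
The updated check lines above replace imulq/addq (imull/addl on WIN32) pairs with andq/subq (andl/subl) pairs when forming the high words that come from the sign extension of one operand. A minimal C++ sketch of the equivalent scalar rewrite, not taken from the patch (the helper name is hypothetical, and arithmetic right shift of negative values is assumed):

    #include <cassert>
    #include <cstdint>

    // Hypothetical helper: the high-word contribution from the sign word of x.
    // old codegen shape:  (x >> 63) * y        -> one imul
    // new codegen shape:  0 - ((x >> 63) & y)  -> an and feeding a sub/neg
    int64_t high_contrib(int64_t x, int64_t y) {
      int64_t sign = x >> 63;   // 0 or -1 (assumes arithmetic shift)
      return 0 - (sign & y);    // matches the and/sub sequences in the checks above
    }

    int main() {
      const int64_t xs[] = {INT64_MIN, -7, 0, 7, INT64_MAX};
      const int64_t ys[] = {-3, 0, 42};
      for (int64_t x : xs)
        for (int64_t y : ys)
          assert(high_contrib(x, y) == (x >> 63) * y);  // both forms agree
      return 0;
    }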