[llvm] 0148df8 - [DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 11 16:21:21 PDT 2022


Author: Craig Topper
Date: 2022-10-11T16:20:55-07:00
New Revision: 0148df8157f05ecf3b1064508e6f012aefb87dad

URL: https://github.com/llvm/llvm-project/commit/0148df8157f05ecf3b1064508e6f012aefb87dad
DIFF: https://github.com/llvm/llvm-project/commit/0148df8157f05ecf3b1064508e6f012aefb87dad.diff

LOG: [DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))

(sra X, BW-1) is either 0 or -1, so the multiply acts as a conditional
negation of Y.
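
For illustration only (not part of the commit), here is a minimal C++
sketch of the scalar identity the combine relies on; it assumes that
right-shifting a negative value is an arithmetic shift (true on the
targets involved, and guaranteed since C++20):

#include <cassert>
#include <cstdint>

// M = X >> (BW-1) is all-zeros or all-ones, so M * Y == -(M & Y).
int64_t mulBySignMask(int64_t X, int64_t Y) {
  int64_t M = X >> 63;   // sra X, BW-1: 0 or -1
  return M * Y;          // original form: (mul (sra X, 63), Y)
}

int64_t negOfAnd(int64_t X, int64_t Y) {
  int64_t M = X >> 63;
  return -(M & Y);       // combined form: (neg (and (sra X, 63), Y))
}

int main() {
  const int64_t Xs[] = {INT64_MIN, -7, 0, 42};
  const int64_t Ys[] = {-3, 0, 123456789};
  for (int64_t X : Xs)
    for (int64_t Y : Ys)
      assert(mulBySignMask(X, Y) == negOfAnd(X, Y));
  return 0;
}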

This pattern shows up when type-legalizing wide multiplies that involve
a sign-extended value.
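
As a hypothetical source-level example of where this arises (it mirrors
the i64_zext_sext_i128 test in extmul128.ll below; the function name and
the __int128 compiler extension are only for illustration):

#include <cstdint>

// Type legalization splits the i128 multiply; the high half contains
// (mul (sra %b, 63), %a), which the new combine rewrites to neg+and.
__int128 zext_times_sext(uint64_t a, int64_t b) {
  return (__int128)a * (__int128)b;
}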

Fixes PR57549.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D133399

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/AMDGPU/mad_64_32.ll
    llvm/test/CodeGen/PowerPC/pr45448.ll
    llvm/test/CodeGen/RISCV/mul.ll
    llvm/test/CodeGen/RISCV/xaluo.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
    llvm/test/CodeGen/X86/extmul128.ll
    llvm/test/CodeGen/X86/muloti.ll
    llvm/test/CodeGen/X86/smul_fix_sat.ll
    llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 195238eda3b92..131364e330232 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3939,6 +3939,30 @@ SDValue DAGCombiner::visitMULFIX(SDNode *N) {
   return SDValue();
 }
 
+// Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
+static SDValue foldSraMulToAndNeg(SDNode *N, SDValue N0, SDValue N1,
+                                  SelectionDAG &DAG) {
+  if (N0.getOpcode() != ISD::SRA)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // TODO: Use computeNumSignBits() == BitWidth?
+  unsigned BitWidth = VT.getScalarSizeInBits();
+  ConstantSDNode *ShiftAmt = isConstOrConstSplat(N0.getOperand(1));
+  if (!ShiftAmt || ShiftAmt->getAPIntValue() != (BitWidth - 1))
+    return SDValue();
+
+  // If optimizing for minsize, we don't want to increase the number of
+  // instructions.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  SDLoc dl(N);
+  SDValue And = DAG.getNode(ISD::AND, dl, VT, N0, N1);
+  return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), And);
+}
+
 SDValue DAGCombiner::visitMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4149,6 +4173,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
     }
   }
 
+  if (SDValue V = foldSraMulToAndNeg(N, N0, N1, DAG))
+    return V;
+  if (SDValue V = foldSraMulToAndNeg(N, N1, N0, DAG))
+    return V;
+
   // reassociate mul
   if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
     return RMUL;

diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index e955014371525..c01ec69629f30 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -39,21 +39,24 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
 ; AARCH:       // %bb.0: // %Entry
 ; AARCH-NEXT:    asr x9, x1, #63
 ; AARCH-NEXT:    asr x10, x3, #63
+; AARCH-NEXT:    and x11, x9, x2
+; AARCH-NEXT:    and x14, x10, x1
+; AARCH-NEXT:    umulh x12, x2, x9
+; AARCH-NEXT:    and x9, x9, x3
+; AARCH-NEXT:    umulh x13, x10, x0
+; AARCH-NEXT:    and x10, x10, x0
+; AARCH-NEXT:    sub x12, x12, x11
+; AARCH-NEXT:    neg x11, x11
+; AARCH-NEXT:    sub x13, x13, x14
+; AARCH-NEXT:    sub x9, x12, x9
+; AARCH-NEXT:    sub x12, x13, x10
+; AARCH-NEXT:    neg x10, x10
 ; AARCH-NEXT:    umulh x14, x0, x2
-; AARCH-NEXT:    mov x8, x1
-; AARCH-NEXT:    mul x11, x2, x9
-; AARCH-NEXT:    str wzr, [x4]
-; AARCH-NEXT:    umulh x12, x10, x0
-; AARCH-NEXT:    umulh x13, x2, x9
-; AARCH-NEXT:    madd x12, x10, x1, x12
-; AARCH-NEXT:    add x13, x13, x11
-; AARCH-NEXT:    mul x10, x10, x0
-; AARCH-NEXT:    madd x9, x3, x9, x13
-; AARCH-NEXT:    add x12, x12, x10
 ; AARCH-NEXT:    adds x10, x10, x11
 ; AARCH-NEXT:    mul x11, x1, x2
 ; AARCH-NEXT:    adc x9, x12, x9
 ; AARCH-NEXT:    umulh x13, x1, x2
+; AARCH-NEXT:    mov x8, x1
 ; AARCH-NEXT:    mul x12, x0, x3
 ; AARCH-NEXT:    adds x11, x11, x14
 ; AARCH-NEXT:    umulh x14, x0, x3
@@ -73,6 +76,7 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
 ; AARCH-NEXT:    eor x9, x9, x11
 ; AARCH-NEXT:    eor x10, x10, x11
 ; AARCH-NEXT:    orr x9, x10, x9
+; AARCH-NEXT:    str wzr, [x4]
 ; AARCH-NEXT:    cmp x9, #0
 ; AARCH-NEXT:    cset w9, ne
 ; AARCH-NEXT:    tbz x8, #63, .LBB1_2

diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index f806149d0c395..bac0255ff1ce5 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -159,24 +159,28 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; CI-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
+; CI-NEXT:    v_ashrrev_i32_e32 v11, 31, v0
 ; CI-NEXT:    v_mov_b32_e32 v8, 0
-; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
-; CI-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
-; CI-NEXT:    v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
-; CI-NEXT:    v_mov_b32_e32 v7, v10
+; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v11, v1, v[7:8]
+; CI-NEXT:    v_ashrrev_i32_e32 v12, 31, v1
+; CI-NEXT:    v_and_b32_e32 v14, v11, v1
+; CI-NEXT:    v_mov_b32_e32 v1, v10
 ; CI-NEXT:    v_mov_b32_e32 v10, v8
-; CI-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
-; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
-; CI-NEXT:    v_add_i32_e32 v9, vcc, v7, v9
-; CI-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
-; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
-; CI-NEXT:    v_add_i32_e32 v7, vcc, v9, v0
-; CI-NEXT:    v_addc_u32_e32 v9, vcc, v10, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v1, v8
+; CI-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10]
+; CI-NEXT:    v_and_b32_e32 v13, v11, v12
+; CI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v14
+; CI-NEXT:    v_subb_u32_e32 v10, vcc, 0, v13, vcc
+; CI-NEXT:    v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10]
+; CI-NEXT:    v_mov_b32_e32 v0, v8
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CI-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1]
+; CI-NEXT:    v_add_i32_e32 v8, vcc, v0, v9
+; CI-NEXT:    v_addc_u32_e32 v9, vcc, v1, v10, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, v7
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v6, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
+; CI-NEXT:    v_addc_u32_e32 v2, vcc, v8, v4, vcc
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -184,60 +188,64 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
-; SI-NEXT:    v_mul_lo_u32 v11, v6, v1
-; SI-NEXT:    v_mul_hi_u32 v12, v0, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; SI-NEXT:    v_mul_hi_u32 v14, v6, v1
-; SI-NEXT:    v_mul_lo_u32 v13, v0, v7
-; SI-NEXT:    v_mul_hi_u32 v10, v0, v7
-; SI-NEXT:    v_add_i32_e32 v12, vcc, v11, v12
-; SI-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
-; SI-NEXT:    v_mul_hi_u32 v8, v6, v7
-; SI-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; SI-NEXT:    v_mul_i32_i24_e32 v9, v6, v7
-; SI-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
-; SI-NEXT:    v_mul_hi_i32 v6, v1, v6
-; SI-NEXT:    v_mul_hi_i32 v7, v7, v0
-; SI-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
-; SI-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; SI-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
-; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v11
+; SI-NEXT:    v_and_b32_e32 v9, v7, v0
+; SI-NEXT:    v_and_b32_e32 v10, v6, v1
+; SI-NEXT:    v_mul_lo_u32 v13, v6, v1
+; SI-NEXT:    v_mul_hi_u32 v14, v0, v1
+; SI-NEXT:    v_and_b32_e32 v8, v7, v6
+; SI-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; SI-NEXT:    v_mul_hi_u32 v10, v6, v7
+; SI-NEXT:    v_mul_i32_i24_e32 v11, v6, v7
+; SI-NEXT:    v_mul_hi_u32 v6, v6, v1
+; SI-NEXT:    v_mul_hi_u32 v12, v0, v7
+; SI-NEXT:    v_mul_lo_u32 v7, v0, v7
+; SI-NEXT:    v_addc_u32_e32 v8, vcc, v8, v8, vcc
+; SI-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; SI-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; SI-NEXT:    v_addc_u32_e32 v12, vcc, 0, v12, vcc
+; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; SI-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, vcc
+; SI-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
 ; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; SI-NEXT:    v_add_i32_e32 v7, vcc, v9, v10
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
+; SI-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v9
+; SI-NEXT:    v_subb_u32_e32 v8, vcc, v10, v8, vcc
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT:    v_addc_u32_e32 v1, vcc, v12, v3, vcc
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v7, v3, vcc
+; SI-NEXT:    v_addc_u32_e32 v2, vcc, v6, v4, vcc
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, v8, v5, vcc
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0
-; GFX9-NEXT:    v_mov_b32_e32 v8, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
-; GFX9-NEXT:    v_mov_b32_e32 v8, v11
-; GFX9-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v12, v11
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
-; GFX9-NEXT:    v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
-; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v1, v10
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
+; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
+; GFX9-NEXT:    v_and_b32_e32 v6, v14, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v9
+; GFX9-NEXT:    v_and_b32_e32 v7, v14, v15
+; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v6
+; GFX9-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11]
+; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, v13
+; GFX9-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7]
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13]
+; GFX9-NEXT:    v_mov_b32_e32 v12, v1
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11]
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
@@ -246,27 +254,30 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
-; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v16, 31, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v17, 31, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
+; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v16, v1, v[7:8]
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
-; GFX11-NEXT:    v_mad_i64_i32 v[9:10], null, v1, v14, 0
-; GFX11-NEXT:    v_mov_b32_e32 v8, v12
+; GFX11-NEXT:    v_and_b32_e32 v8, v16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v17, v[9:10]
+; GFX11-NEXT:    v_and_b32_e32 v9, v16, v17
+; GFX11-NEXT:    v_sub_co_u32 v8, vcc_lo, 0, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v1, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
-; GFX11-NEXT:    v_add_co_u32 v7, s0, v7, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, 0, s0
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
+; GFX11-NEXT:    v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9]
+; GFX11-NEXT:    v_add_co_u32 v12, s0, v7, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v7, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v12
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v13, null, 0, 0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13]
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)

diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll
index 0f8014df8adca..c3337c78a4770 100644
--- a/llvm/test/CodeGen/PowerPC/pr45448.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45448.ll
@@ -25,7 +25,8 @@ define hidden void @julia_tryparse_internal_45896() #0 {
 ; CHECK-NEXT:    rldic r5, r5, 4, 32
 ; CHECK-NEXT:    crnot 4*cr5+lt, eq
 ; CHECK-NEXT:    mulhdu r3, r3, r5
-; CHECK-NEXT:    maddld r6, r4, r5, r3
+; CHECK-NEXT:    and r6, r4, r5
+; CHECK-NEXT:    sub r6, r3, r6
 ; CHECK-NEXT:    cmpld cr1, r6, r3
 ; CHECK-NEXT:    mulhdu. r3, r4, r5
 ; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_10

diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 3923c4340d30e..986e799428e57 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1480,18 +1480,18 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32IM-NEXT:    add a5, a6, a2
 ; RV32IM-NEXT:    mul a7, a1, a3
 ; RV32IM-NEXT:    add t0, a7, a5
-; RV32IM-NEXT:    mul t1, a4, a0
-; RV32IM-NEXT:    add a2, t0, t1
+; RV32IM-NEXT:    and t1, a4, a0
+; RV32IM-NEXT:    sub a2, t0, t1
 ; RV32IM-NEXT:    sltu t2, a2, t0
 ; RV32IM-NEXT:    sltu a7, t0, a7
 ; RV32IM-NEXT:    sltu a5, a5, a6
 ; RV32IM-NEXT:    mulhu a3, a1, a3
 ; RV32IM-NEXT:    add a3, a3, a5
 ; RV32IM-NEXT:    add a3, a3, a7
-; RV32IM-NEXT:    mul a1, a4, a1
+; RV32IM-NEXT:    and a1, a4, a1
 ; RV32IM-NEXT:    mulhu a0, a4, a0
-; RV32IM-NEXT:    add a0, a0, a1
-; RV32IM-NEXT:    add a0, a0, t1
+; RV32IM-NEXT:    sub a0, a0, a1
+; RV32IM-NEXT:    sub a0, a0, t1
 ; RV32IM-NEXT:    add a0, a3, a0
 ; RV32IM-NEXT:    add a1, a0, t2
 ; RV32IM-NEXT:    mv a0, a2

diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index f6963fd674d3e..f3391b2816495 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -961,8 +961,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
+; RV32-NEXT:    .cfi_offset s2, -12
 ; RV32-NEXT:    mulhu a5, a0, a2
 ; RV32-NEXT:    mul a6, a1, a2
 ; RV32-NEXT:    add a5, a6, a5
@@ -978,33 +980,34 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32-NEXT:    mul t0, a1, a3
 ; RV32-NEXT:    add t1, t0, a7
 ; RV32-NEXT:    srai t2, a1, 31
-; RV32-NEXT:    mul t3, a2, t2
+; RV32-NEXT:    and t3, t2, a2
 ; RV32-NEXT:    srai t4, a3, 31
-; RV32-NEXT:    mul t5, t4, a0
-; RV32-NEXT:    add t6, t5, t3
-; RV32-NEXT:    add s0, t1, t6
-; RV32-NEXT:    sltu s1, s0, t1
+; RV32-NEXT:    and t5, t4, a0
+; RV32-NEXT:    neg t6, t5
+; RV32-NEXT:    sub s0, t6, t3
+; RV32-NEXT:    add s1, t1, s0
+; RV32-NEXT:    sltu s2, s1, t1
 ; RV32-NEXT:    sltu t0, t1, t0
 ; RV32-NEXT:    sltu a6, a7, a6
 ; RV32-NEXT:    mulhu a7, a1, a3
 ; RV32-NEXT:    add a6, a7, a6
 ; RV32-NEXT:    add a6, a6, t0
 ; RV32-NEXT:    mulhu a7, a2, t2
-; RV32-NEXT:    add a7, a7, t3
-; RV32-NEXT:    mul a3, a3, t2
-; RV32-NEXT:    add a3, a7, a3
-; RV32-NEXT:    mul a1, t4, a1
+; RV32-NEXT:    sub a7, a7, t3
+; RV32-NEXT:    and a3, t2, a3
+; RV32-NEXT:    sub a3, a7, a3
+; RV32-NEXT:    and a1, t4, a1
 ; RV32-NEXT:    mulhu a7, t4, a0
-; RV32-NEXT:    add a1, a7, a1
-; RV32-NEXT:    add a1, a1, t5
+; RV32-NEXT:    sub a1, a7, a1
+; RV32-NEXT:    sub a1, a1, t5
 ; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    sltu a3, t6, t5
+; RV32-NEXT:    sltu a3, s0, t6
 ; RV32-NEXT:    add a1, a1, a3
 ; RV32-NEXT:    add a1, a6, a1
-; RV32-NEXT:    add a1, a1, s1
+; RV32-NEXT:    add a1, a1, s2
 ; RV32-NEXT:    srai a3, a5, 31
 ; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    xor a3, s0, a3
+; RV32-NEXT:    xor a3, s1, a3
 ; RV32-NEXT:    or a1, a3, a1
 ; RV32-NEXT:    snez a1, a1
 ; RV32-NEXT:    mul a0, a0, a2
@@ -1013,6 +1016,7 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -1032,8 +1036,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
 ; RV32ZBA-NEXT:    .cfi_offset s1, -8
+; RV32ZBA-NEXT:    .cfi_offset s2, -12
 ; RV32ZBA-NEXT:    mulhu a5, a0, a2
 ; RV32ZBA-NEXT:    mul a6, a1, a2
 ; RV32ZBA-NEXT:    add a5, a6, a5
@@ -1049,33 +1055,34 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32ZBA-NEXT:    mul t0, a1, a3
 ; RV32ZBA-NEXT:    add t1, t0, a7
 ; RV32ZBA-NEXT:    srai t2, a1, 31
-; RV32ZBA-NEXT:    mul t3, a2, t2
+; RV32ZBA-NEXT:    and t3, t2, a2
 ; RV32ZBA-NEXT:    srai t4, a3, 31
-; RV32ZBA-NEXT:    mul t5, t4, a0
-; RV32ZBA-NEXT:    add t6, t5, t3
-; RV32ZBA-NEXT:    add s0, t1, t6
-; RV32ZBA-NEXT:    sltu s1, s0, t1
+; RV32ZBA-NEXT:    and t5, t4, a0
+; RV32ZBA-NEXT:    neg t6, t5
+; RV32ZBA-NEXT:    sub s0, t6, t3
+; RV32ZBA-NEXT:    add s1, t1, s0
+; RV32ZBA-NEXT:    sltu s2, s1, t1
 ; RV32ZBA-NEXT:    sltu t0, t1, t0
 ; RV32ZBA-NEXT:    sltu a6, a7, a6
 ; RV32ZBA-NEXT:    mulhu a7, a1, a3
 ; RV32ZBA-NEXT:    add a6, a7, a6
 ; RV32ZBA-NEXT:    add a6, a6, t0
 ; RV32ZBA-NEXT:    mulhu a7, a2, t2
-; RV32ZBA-NEXT:    add a7, a7, t3
-; RV32ZBA-NEXT:    mul a3, a3, t2
-; RV32ZBA-NEXT:    add a3, a7, a3
-; RV32ZBA-NEXT:    mul a1, t4, a1
+; RV32ZBA-NEXT:    sub a7, a7, t3
+; RV32ZBA-NEXT:    and a3, t2, a3
+; RV32ZBA-NEXT:    sub a3, a7, a3
+; RV32ZBA-NEXT:    and a1, t4, a1
 ; RV32ZBA-NEXT:    mulhu a7, t4, a0
-; RV32ZBA-NEXT:    add a1, a7, a1
-; RV32ZBA-NEXT:    add a1, a1, t5
+; RV32ZBA-NEXT:    sub a1, a7, a1
+; RV32ZBA-NEXT:    sub a1, a1, t5
 ; RV32ZBA-NEXT:    add a1, a1, a3
-; RV32ZBA-NEXT:    sltu a3, t6, t5
+; RV32ZBA-NEXT:    sltu a3, s0, t6
 ; RV32ZBA-NEXT:    add a1, a1, a3
 ; RV32ZBA-NEXT:    add a1, a6, a1
-; RV32ZBA-NEXT:    add a1, a1, s1
+; RV32ZBA-NEXT:    add a1, a1, s2
 ; RV32ZBA-NEXT:    srai a3, a5, 31
 ; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    xor a3, s0, a3
+; RV32ZBA-NEXT:    xor a3, s1, a3
 ; RV32ZBA-NEXT:    or a1, a3, a1
 ; RV32ZBA-NEXT:    snez a1, a1
 ; RV32ZBA-NEXT:    mul a0, a0, a2
@@ -1084,6 +1091,7 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -1115,8 +1123,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    srai a1, a1, 31
-; RV32-NEXT:    mul a6, a1, a3
-; RV32-NEXT:    add a6, a5, a6
+; RV32-NEXT:    andi a6, a1, 13
+; RV32-NEXT:    sub a6, a5, a6
 ; RV32-NEXT:    srai a7, a4, 31
 ; RV32-NEXT:    xor t0, a6, a7
 ; RV32-NEXT:    sltu a5, a6, a5
@@ -1152,8 +1160,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    srai a1, a1, 31
-; RV32ZBA-NEXT:    mul a6, a1, a3
-; RV32ZBA-NEXT:    add a6, a5, a6
+; RV32ZBA-NEXT:    andi a6, a1, 13
+; RV32ZBA-NEXT:    sub a6, a5, a6
 ; RV32ZBA-NEXT:    srai a7, a4, 31
 ; RV32ZBA-NEXT:    xor t0, a6, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
@@ -2352,7 +2360,9 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    add a4, a5, a4
@@ -2368,33 +2378,34 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mul a7, a1, a3
 ; RV32-NEXT:    add t0, a7, a6
 ; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    mul t2, a2, t1
+; RV32-NEXT:    and t2, t1, a2
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    mul t4, t3, a0
-; RV32-NEXT:    add t5, t4, t2
-; RV32-NEXT:    add t6, t0, t5
-; RV32-NEXT:    sltu s0, t6, t0
+; RV32-NEXT:    and t4, t3, a0
+; RV32-NEXT:    neg t5, t4
+; RV32-NEXT:    sub t6, t5, t2
+; RV32-NEXT:    add s0, t0, t6
+; RV32-NEXT:    sltu s1, s0, t0
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a6, a2, t1
-; RV32-NEXT:    add a6, a6, t2
-; RV32-NEXT:    mul a7, a3, t1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    mul a7, t3, a1
+; RV32-NEXT:    sub a6, a6, t2
+; RV32-NEXT:    and a7, t1, a3
+; RV32-NEXT:    sub a6, a6, a7
+; RV32-NEXT:    and a7, t3, a1
 ; RV32-NEXT:    mulhu t0, t3, a0
-; RV32-NEXT:    add a7, t0, a7
-; RV32-NEXT:    add a7, a7, t4
+; RV32-NEXT:    sub a7, t0, a7
+; RV32-NEXT:    sub a7, a7, t4
 ; RV32-NEXT:    add a6, a7, a6
-; RV32-NEXT:    sltu a7, t5, t4
+; RV32-NEXT:    sltu a7, t6, t5
 ; RV32-NEXT:    add a6, a6, a7
 ; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a5, a5, s0
+; RV32-NEXT:    add a5, a5, s1
 ; RV32-NEXT:    srai a4, a4, 31
 ; RV32-NEXT:    xor a5, a5, a4
-; RV32-NEXT:    xor a4, t6, a4
+; RV32-NEXT:    xor a4, s0, a4
 ; RV32-NEXT:    or a4, a4, a5
 ; RV32-NEXT:    bnez a4, .LBB46_2
 ; RV32-NEXT:  # %bb.1: # %entry
@@ -2402,6 +2413,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:  .LBB46_2: # %entry
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -2421,7 +2433,9 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    addi sp, sp, -16
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
+; RV32ZBA-NEXT:    .cfi_offset s1, -8
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    add a4, a5, a4
@@ -2437,33 +2451,34 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mul a7, a1, a3
 ; RV32ZBA-NEXT:    add t0, a7, a6
 ; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    mul t2, a2, t1
+; RV32ZBA-NEXT:    and t2, t1, a2
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    mul t4, t3, a0
-; RV32ZBA-NEXT:    add t5, t4, t2
-; RV32ZBA-NEXT:    add t6, t0, t5
-; RV32ZBA-NEXT:    sltu s0, t6, t0
+; RV32ZBA-NEXT:    and t4, t3, a0
+; RV32ZBA-NEXT:    neg t5, t4
+; RV32ZBA-NEXT:    sub t6, t5, t2
+; RV32ZBA-NEXT:    add s0, t0, t6
+; RV32ZBA-NEXT:    sltu s1, s0, t0
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a6, a2, t1
-; RV32ZBA-NEXT:    add a6, a6, t2
-; RV32ZBA-NEXT:    mul a7, a3, t1
-; RV32ZBA-NEXT:    add a6, a6, a7
-; RV32ZBA-NEXT:    mul a7, t3, a1
+; RV32ZBA-NEXT:    sub a6, a6, t2
+; RV32ZBA-NEXT:    and a7, t1, a3
+; RV32ZBA-NEXT:    sub a6, a6, a7
+; RV32ZBA-NEXT:    and a7, t3, a1
 ; RV32ZBA-NEXT:    mulhu t0, t3, a0
-; RV32ZBA-NEXT:    add a7, t0, a7
-; RV32ZBA-NEXT:    add a7, a7, t4
+; RV32ZBA-NEXT:    sub a7, t0, a7
+; RV32ZBA-NEXT:    sub a7, a7, t4
 ; RV32ZBA-NEXT:    add a6, a7, a6
-; RV32ZBA-NEXT:    sltu a7, t5, t4
+; RV32ZBA-NEXT:    sltu a7, t6, t5
 ; RV32ZBA-NEXT:    add a6, a6, a7
 ; RV32ZBA-NEXT:    add a5, a5, a6
-; RV32ZBA-NEXT:    add a5, a5, s0
+; RV32ZBA-NEXT:    add a5, a5, s1
 ; RV32ZBA-NEXT:    srai a4, a4, 31
 ; RV32ZBA-NEXT:    xor a5, a5, a4
-; RV32ZBA-NEXT:    xor a4, t6, a4
+; RV32ZBA-NEXT:    xor a4, s0, a4
 ; RV32ZBA-NEXT:    or a4, a4, a5
 ; RV32ZBA-NEXT:    bnez a4, .LBB46_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
@@ -2471,6 +2486,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mv a1, a3
 ; RV32ZBA-NEXT:  .LBB46_2: # %entry
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2497,7 +2513,9 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    add a4, a5, a4
@@ -2513,36 +2531,38 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mul a7, a1, a3
 ; RV32-NEXT:    add t0, a7, a6
 ; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    mul t2, a2, t1
+; RV32-NEXT:    and t2, t1, a2
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    mul t4, t3, a0
-; RV32-NEXT:    add t5, t4, t2
-; RV32-NEXT:    add t6, t0, t5
-; RV32-NEXT:    sltu s0, t6, t0
+; RV32-NEXT:    and t4, t3, a0
+; RV32-NEXT:    neg t5, t4
+; RV32-NEXT:    sub t6, t5, t2
+; RV32-NEXT:    add s0, t0, t6
+; RV32-NEXT:    sltu s1, s0, t0
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a2, a2, t1
-; RV32-NEXT:    add a2, a2, t2
-; RV32-NEXT:    mul a3, a3, t1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    mul a1, t3, a1
+; RV32-NEXT:    sub a2, a2, t2
+; RV32-NEXT:    and a3, t1, a3
+; RV32-NEXT:    sub a2, a2, a3
+; RV32-NEXT:    and a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, t4
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    sub a0, a0, t4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sltu a1, t5, t4
+; RV32-NEXT:    sltu a1, t6, t5
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a0, a0, s0
+; RV32-NEXT:    add a0, a0, s1
 ; RV32-NEXT:    srai a1, a4, 31
 ; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a1, t6, a1
+; RV32-NEXT:    xor a1, s0, a1
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -2560,7 +2580,9 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    addi sp, sp, -16
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
+; RV32ZBA-NEXT:    .cfi_offset s1, -8
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    add a4, a5, a4
@@ -2576,36 +2598,38 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mul a7, a1, a3
 ; RV32ZBA-NEXT:    add t0, a7, a6
 ; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    mul t2, a2, t1
+; RV32ZBA-NEXT:    and t2, t1, a2
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    mul t4, t3, a0
-; RV32ZBA-NEXT:    add t5, t4, t2
-; RV32ZBA-NEXT:    add t6, t0, t5
-; RV32ZBA-NEXT:    sltu s0, t6, t0
+; RV32ZBA-NEXT:    and t4, t3, a0
+; RV32ZBA-NEXT:    neg t5, t4
+; RV32ZBA-NEXT:    sub t6, t5, t2
+; RV32ZBA-NEXT:    add s0, t0, t6
+; RV32ZBA-NEXT:    sltu s1, s0, t0
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a2, a2, t1
-; RV32ZBA-NEXT:    add a2, a2, t2
-; RV32ZBA-NEXT:    mul a3, a3, t1
-; RV32ZBA-NEXT:    add a2, a2, a3
-; RV32ZBA-NEXT:    mul a1, t3, a1
+; RV32ZBA-NEXT:    sub a2, a2, t2
+; RV32ZBA-NEXT:    and a3, t1, a3
+; RV32ZBA-NEXT:    sub a2, a2, a3
+; RV32ZBA-NEXT:    and a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
-; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    add a0, a0, t4
+; RV32ZBA-NEXT:    sub a0, a0, a1
+; RV32ZBA-NEXT:    sub a0, a0, t4
 ; RV32ZBA-NEXT:    add a0, a0, a2
-; RV32ZBA-NEXT:    sltu a1, t5, t4
+; RV32ZBA-NEXT:    sltu a1, t6, t5
 ; RV32ZBA-NEXT:    add a0, a0, a1
 ; RV32ZBA-NEXT:    add a0, a5, a0
-; RV32ZBA-NEXT:    add a0, a0, s0
+; RV32ZBA-NEXT:    add a0, a0, s1
 ; RV32ZBA-NEXT:    srai a1, a4, 31
 ; RV32ZBA-NEXT:    xor a0, a0, a1
-; RV32ZBA-NEXT:    xor a1, t6, a1
+; RV32ZBA-NEXT:    xor a1, s0, a1
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    seqz a0, a0
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -3453,7 +3477,9 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    add a4, a5, a4
@@ -3469,33 +3495,34 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mul a7, a1, a3
 ; RV32-NEXT:    add t0, a7, a6
 ; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    mul t2, a2, t1
+; RV32-NEXT:    and t2, t1, a2
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    mul t4, t3, a0
-; RV32-NEXT:    add t5, t4, t2
-; RV32-NEXT:    add t6, t0, t5
-; RV32-NEXT:    sltu s0, t6, t0
+; RV32-NEXT:    and t4, t3, a0
+; RV32-NEXT:    neg t5, t4
+; RV32-NEXT:    sub t6, t5, t2
+; RV32-NEXT:    add s0, t0, t6
+; RV32-NEXT:    sltu s1, s0, t0
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a2, a2, t1
-; RV32-NEXT:    add a2, a2, t2
-; RV32-NEXT:    mul a3, a3, t1
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    mul a1, t3, a1
+; RV32-NEXT:    sub a2, a2, t2
+; RV32-NEXT:    and a3, t1, a3
+; RV32-NEXT:    sub a2, a2, a3
+; RV32-NEXT:    and a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, t4
+; RV32-NEXT:    sub a0, a0, a1
+; RV32-NEXT:    sub a0, a0, t4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sltu a1, t5, t4
+; RV32-NEXT:    sltu a1, t6, t5
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a0, a0, s0
+; RV32-NEXT:    add a0, a0, s1
 ; RV32-NEXT:    srai a1, a4, 31
 ; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a1, t6, a1
+; RV32-NEXT:    xor a1, s0, a1
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    beqz a0, .LBB61_2
 ; RV32-NEXT:  # %bb.1: # %overflow
@@ -3505,6 +3532,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    li a0, 1
 ; RV32-NEXT:  .LBB61_3: # %overflow
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -3526,7 +3554,9 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    addi sp, sp, -16
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
+; RV32ZBA-NEXT:    .cfi_offset s1, -8
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    add a4, a5, a4
@@ -3542,33 +3572,34 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mul a7, a1, a3
 ; RV32ZBA-NEXT:    add t0, a7, a6
 ; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    mul t2, a2, t1
+; RV32ZBA-NEXT:    and t2, t1, a2
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    mul t4, t3, a0
-; RV32ZBA-NEXT:    add t5, t4, t2
-; RV32ZBA-NEXT:    add t6, t0, t5
-; RV32ZBA-NEXT:    sltu s0, t6, t0
+; RV32ZBA-NEXT:    and t4, t3, a0
+; RV32ZBA-NEXT:    neg t5, t4
+; RV32ZBA-NEXT:    sub t6, t5, t2
+; RV32ZBA-NEXT:    add s0, t0, t6
+; RV32ZBA-NEXT:    sltu s1, s0, t0
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a2, a2, t1
-; RV32ZBA-NEXT:    add a2, a2, t2
-; RV32ZBA-NEXT:    mul a3, a3, t1
-; RV32ZBA-NEXT:    add a2, a2, a3
-; RV32ZBA-NEXT:    mul a1, t3, a1
+; RV32ZBA-NEXT:    sub a2, a2, t2
+; RV32ZBA-NEXT:    and a3, t1, a3
+; RV32ZBA-NEXT:    sub a2, a2, a3
+; RV32ZBA-NEXT:    and a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
-; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    add a0, a0, t4
+; RV32ZBA-NEXT:    sub a0, a0, a1
+; RV32ZBA-NEXT:    sub a0, a0, t4
 ; RV32ZBA-NEXT:    add a0, a0, a2
-; RV32ZBA-NEXT:    sltu a1, t5, t4
+; RV32ZBA-NEXT:    sltu a1, t6, t5
 ; RV32ZBA-NEXT:    add a0, a0, a1
 ; RV32ZBA-NEXT:    add a0, a5, a0
-; RV32ZBA-NEXT:    add a0, a0, s0
+; RV32ZBA-NEXT:    add a0, a0, s1
 ; RV32ZBA-NEXT:    srai a1, a4, 31
 ; RV32ZBA-NEXT:    xor a0, a0, a1
-; RV32ZBA-NEXT:    xor a1, t6, a1
+; RV32ZBA-NEXT:    xor a1, s0, a1
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    beqz a0, .LBB61_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
@@ -3578,6 +3609,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    li a0, 1
 ; RV32ZBA-NEXT:  .LBB61_3: # %overflow
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -3625,8 +3657,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32-NEXT:    add a6, a4, a6
 ; RV32-NEXT:    sub t1, a6, a1
 ; RV32-NEXT:    srai t2, a1, 31
-; RV32-NEXT:    mul t3, t2, a2
-; RV32-NEXT:    sub t3, t3, a0
+; RV32-NEXT:    andi t3, t2, -13
+; RV32-NEXT:    sub t3, a5, t3
 ; RV32-NEXT:    add t4, t1, t3
 ; RV32-NEXT:    sltu t5, t4, t1
 ; RV32-NEXT:    neg t6, a1
@@ -3687,8 +3719,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32ZBA-NEXT:    add a6, a4, a6
 ; RV32ZBA-NEXT:    sub t1, a6, a1
 ; RV32ZBA-NEXT:    srai t2, a1, 31
-; RV32ZBA-NEXT:    mul t3, t2, a2
-; RV32ZBA-NEXT:    sub t3, t3, a0
+; RV32ZBA-NEXT:    andi t3, t2, -13
+; RV32ZBA-NEXT:    sub t3, a5, t3
 ; RV32ZBA-NEXT:    add t4, t1, t3
 ; RV32ZBA-NEXT:    sltu t5, t4, t1
 ; RV32ZBA-NEXT:    neg t6, a1

diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 217caeebe6335..9cb0ec4d98fb5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -38,22 +38,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_0246_ext0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull lr, r12, r1, r0
-; CHECK-NEXT:    umull r2, r5, r3, r0
+; CHECK-NEXT:    umull r2, r4, r3, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    asrs r2, r0, #31
-; CHECK-NEXT:    mla r4, r1, r2, r12
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    mla r2, r3, r2, r5
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    mla r1, r1, r0, r4
-; CHECK-NEXT:    mla r0, r3, r0, r2
+; CHECK-NEXT:    and.w r2, r1, r0, asr #31
+; CHECK-NEXT:    sub.w r2, r12, r2
+; CHECK-NEXT:    and.w r1, r0, r1, asr #31
+; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    and.w r2, r3, r0, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    and.w r0, r0, r3, asr #31
+; CHECK-NEXT:    subs r0, r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -67,22 +68,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_ext0_0246:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    asrs r4, r0, #31
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull lr, r12, r0, r1
-; CHECK-NEXT:    umull r2, r5, r0, r3
+; CHECK-NEXT:    umull r2, r4, r0, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    asrs r2, r1, #31
-; CHECK-NEXT:    mla r2, r0, r2, r12
-; CHECK-NEXT:    mla r1, r4, r1, r2
-; CHECK-NEXT:    asrs r2, r3, #31
-; CHECK-NEXT:    mla r0, r0, r2, r5
-; CHECK-NEXT:    mla r0, r4, r3, r0
+; CHECK-NEXT:    and.w r2, r0, r1, asr #31
+; CHECK-NEXT:    sub.w r2, r12, r2
+; CHECK-NEXT:    and.w r1, r1, r0, asr #31
+; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    and.w r2, r0, r3, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    and.w r0, r3, r0, asr #31
+; CHECK-NEXT:    subs r0, r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -130,23 +132,24 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_1357_ext0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vrev64.32 q1, q0
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull lr, r12, r1, r0
-; CHECK-NEXT:    umull r2, r5, r3, r0
+; CHECK-NEXT:    umull r2, r4, r3, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    asrs r2, r0, #31
-; CHECK-NEXT:    mla r4, r1, r2, r12
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    mla r2, r3, r2, r5
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    mla r1, r1, r0, r4
-; CHECK-NEXT:    mla r0, r3, r0, r2
+; CHECK-NEXT:    and.w r2, r1, r0, asr #31
+; CHECK-NEXT:    sub.w r2, r12, r2
+; CHECK-NEXT:    and.w r1, r0, r1, asr #31
+; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    and.w r2, r3, r0, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    and.w r0, r0, r3, asr #31
+; CHECK-NEXT:    subs r0, r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -160,23 +163,24 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_ext0_1357:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vrev64.32 q1, q0
-; CHECK-NEXT:    asrs r4, r0, #31
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull lr, r12, r0, r1
-; CHECK-NEXT:    umull r2, r5, r0, r3
+; CHECK-NEXT:    umull r2, r4, r0, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    asrs r2, r1, #31
-; CHECK-NEXT:    mla r2, r0, r2, r12
-; CHECK-NEXT:    mla r1, r4, r1, r2
-; CHECK-NEXT:    asrs r2, r3, #31
-; CHECK-NEXT:    mla r0, r0, r2, r5
-; CHECK-NEXT:    mla r0, r4, r3, r0
+; CHECK-NEXT:    and.w r2, r0, r1, asr #31
+; CHECK-NEXT:    sub.w r2, r12, r2
+; CHECK-NEXT:    and.w r1, r1, r0, asr #31
+; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    and.w r2, r0, r3, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    and.w r0, r3, r0, asr #31
+; CHECK-NEXT:    subs r0, r2, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -230,36 +234,39 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_0213_ext0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov.f32 s4, s1
 ; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    umull r2, r5, r3, r0
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull lr, r12, r1, r0
+; CHECK-NEXT:    umull r2, r4, r3, r0
 ; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT:    asrs r2, r0, #31
-; CHECK-NEXT:    mla r4, r1, r2, r12
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    mla r5, r3, r2, r5
-; CHECK-NEXT:    asrs r3, r3, #31
-; CHECK-NEXT:    mla r1, r1, r0, r4
-; CHECK-NEXT:    mla r3, r3, r0, r5
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
+; CHECK-NEXT:    and.w r2, r1, r0, asr #31
+; CHECK-NEXT:    sub.w r2, r12, r2
+; CHECK-NEXT:    and.w r1, r0, r1, asr #31
+; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    and.w r2, r3, r0, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    and.w r3, r0, r3, asr #31
+; CHECK-NEXT:    subs r2, r2, r3
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    umull r3, r5, r1, r0
-; CHECK-NEXT:    mla r5, r1, r2, r5
-; CHECK-NEXT:    asrs r1, r1, #31
-; CHECK-NEXT:    mla r12, r1, r0, r5
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    umull r4, r1, r5, r0
-; CHECK-NEXT:    mla r1, r5, r2, r1
-; CHECK-NEXT:    asrs r2, r5, #31
+; CHECK-NEXT:    and.w r2, r1, r0, asr #31
+; CHECK-NEXT:    umull r3, r4, r1, r0
+; CHECK-NEXT:    and.w r1, r0, r1, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    sub.w r12, r2, r1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    umull r4, r1, r2, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    mla r0, r2, r0, r1
+; CHECK-NEXT:    and.w r3, r2, r0, asr #31
+; CHECK-NEXT:    and.w r0, r0, r2, asr #31
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    subs r0, r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   %out1 = sext <4 x i32> %shuf1 to <4 x i64>
@@ -273,36 +280,39 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_ext0_0213:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    asrs r4, r0, #31
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov.f32 s4, s1
 ; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    umull r2, r5, r0, r3
+; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull lr, r12, r0, r1
+; CHECK-NEXT:    umull r2, r4, r0, r3
 ; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT:    asrs r2, r1, #31
-; CHECK-NEXT:    mla r2, r0, r2, r12
-; CHECK-NEXT:    mla r1, r4, r1, r2
-; CHECK-NEXT:    asrs r2, r3, #31
-; CHECK-NEXT:    mla r2, r0, r2, r5
-; CHECK-NEXT:    mla r2, r4, r3, r2
+; CHECK-NEXT:    and.w r2, r0, r1, asr #31
+; CHECK-NEXT:    sub.w r2, r12, r2
+; CHECK-NEXT:    and.w r1, r1, r0, asr #31
+; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    and.w r2, r0, r3, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    and.w r3, r3, r0, asr #31
+; CHECK-NEXT:    subs r2, r2, r3
 ; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    umull r2, r3, r0, r1
-; CHECK-NEXT:    asrs r5, r1, #31
-; CHECK-NEXT:    mla r3, r0, r5, r3
-; CHECK-NEXT:    mla r12, r4, r1, r3
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    umull r5, r1, r0, r3
-; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
-; CHECK-NEXT:    asrs r2, r3, #31
-; CHECK-NEXT:    mla r0, r0, r2, r1
-; CHECK-NEXT:    mla r0, r4, r3, r0
+; CHECK-NEXT:    umull r3, r4, r0, r1
+; CHECK-NEXT:    and.w r2, r0, r1, asr #31
+; CHECK-NEXT:    and.w r1, r1, r0, asr #31
+; CHECK-NEXT:    subs r2, r4, r2
+; CHECK-NEXT:    sub.w r12, r2, r1
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    umull r4, r1, r0, r2
+; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT:    and.w r3, r0, r2, asr #31
+; CHECK-NEXT:    and.w r0, r2, r0, asr #31
+; CHECK-NEXT:    subs r1, r1, r3
+; CHECK-NEXT:    subs r0, r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r4, pc}
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   %out1 = sext <4 x i32> %shuf1 to <4 x i64>

diff --git a/llvm/test/CodeGen/X86/extmul128.ll b/llvm/test/CodeGen/X86/extmul128.ll
index a7f2959a23c2c..a2d8211888618 100644
--- a/llvm/test/CodeGen/X86/extmul128.ll
+++ b/llvm/test/CodeGen/X86/extmul128.ll
@@ -29,8 +29,8 @@ define i128 @i64_zext_sext_i128(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    mulq %rsi
 ; CHECK-NEXT:    sarq $63, %rsi
-; CHECK-NEXT:    imulq %rdi, %rsi
-; CHECK-NEXT:    addq %rsi, %rdx
+; CHECK-NEXT:    andq %rdi, %rsi
+; CHECK-NEXT:    subq %rsi, %rdx
 ; CHECK-NEXT:    retq
   %aa = zext i64 %a to i128
   %bb = sext i64 %b to i128
@@ -45,6 +45,37 @@ define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rdi, %rcx
 ; CHECK-NEXT:    sarq $63, %rcx
 ; CHECK-NEXT:    mulq %rsi
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    subq %rcx, %rdx
+; CHECK-NEXT:    retq
+  %aa = sext i64 %a to i128
+  %bb = zext i64 %b to i128
+  %cc = mul i128 %aa, %bb
+  ret i128 %cc
+}
+
+define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize {
+; CHECK-LABEL: i64_zext_sext_i128_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    mulq %rsi
+; CHECK-NEXT:    sarq $63, %rsi
+; CHECK-NEXT:    imulq %rdi, %rsi
+; CHECK-NEXT:    addq %rsi, %rdx
+; CHECK-NEXT:    retq
+  %aa = zext i64 %a to i128
+  %bb = sext i64 %b to i128
+  %cc = mul i128 %aa, %bb
+  ret i128 %cc
+}
+
+define i128 @i64_sext_zext_i128_minsize(i64 %a, i64 %b) minsize {
+; CHECK-LABEL: i64_sext_zext_i128_minsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq %rdi, %rcx
+; CHECK-NEXT:    sarq $63, %rcx
+; CHECK-NEXT:    mulq %rsi
 ; CHECK-NEXT:    imulq %rsi, %rcx
 ; CHECK-NEXT:    addq %rcx, %rdx
 ; CHECK-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 9a6cf0b065662..3733306f354a5 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -7,34 +7,39 @@
 define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp {
 ; CHECK-LABEL: x:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    pushq %r15
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
-; CHECK-NEXT:    .cfi_offset %rbx, -24
-; CHECK-NEXT:    .cfi_offset %r14, -16
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset %rbx, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %r15, -16
 ; CHECK-NEXT:    movq %rdx, %r11
 ; CHECK-NEXT:    movq %rdi, %r9
-; CHECK-NEXT:    movq %rsi, %rbx
-; CHECK-NEXT:    sarq $63, %rbx
-; CHECK-NEXT:    movq %rdx, %rdi
-; CHECK-NEXT:    imulq %rbx, %rdi
+; CHECK-NEXT:    movq %rsi, %rdi
+; CHECK-NEXT:    sarq $63, %rdi
+; CHECK-NEXT:    movq %rdi, %r10
+; CHECK-NEXT:    andq %rdx, %r10
 ; CHECK-NEXT:    movq %rdx, %rax
-; CHECK-NEXT:    mulq %rbx
+; CHECK-NEXT:    mulq %rdi
 ; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    addq %rdi, %rdx
-; CHECK-NEXT:    imulq %rcx, %rbx
-; CHECK-NEXT:    addq %rdx, %rbx
-; CHECK-NEXT:    movq %rcx, %rdi
-; CHECK-NEXT:    sarq $63, %rdi
-; CHECK-NEXT:    movq %rdi, %r14
-; CHECK-NEXT:    imulq %rsi, %r14
-; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq %rdx, %rbx
+; CHECK-NEXT:    subq %r10, %rbx
+; CHECK-NEXT:    andq %rcx, %rdi
+; CHECK-NEXT:    subq %rdi, %rbx
+; CHECK-NEXT:    movq %rcx, %r14
+; CHECK-NEXT:    sarq $63, %r14
+; CHECK-NEXT:    movq %r14, %r15
+; CHECK-NEXT:    andq %rsi, %r15
+; CHECK-NEXT:    movq %r14, %rax
 ; CHECK-NEXT:    mulq %r9
 ; CHECK-NEXT:    movq %rax, %r10
-; CHECK-NEXT:    addq %r14, %rdx
-; CHECK-NEXT:    imulq %r9, %rdi
-; CHECK-NEXT:    addq %rdx, %rdi
+; CHECK-NEXT:    movq %rdx, %rdi
+; CHECK-NEXT:    subq %r15, %rdi
+; CHECK-NEXT:    andq %r9, %r14
+; CHECK-NEXT:    subq %r14, %rdi
 ; CHECK-NEXT:    addq %r8, %r10
 ; CHECK-NEXT:    adcq %rbx, %rdi
 ; CHECK-NEXT:    movq %r9, %rax
@@ -72,6 +77,7 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    movq %r9, %rdx
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  LBB0_1: ## %overflow
 ; CHECK-NEXT:    ud2

diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 996601ed3be64..07debb11b92f7 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -369,8 +369,8 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 28
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 32
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -378,52 +378,54 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    imull %ebx, %edi
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl %eax, %ebx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    subl %ebx, %esi
+; X86-NEXT:    andl %ebp, %edi
+; X86-NEXT:    subl %edi, %esi
 ; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    imull %ebp, %ebx
-; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl %ecx, %ebp
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    imull %esi, %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %ebp, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %edx, %edi
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    xorl %edi, %edx
@@ -434,11 +436,11 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    xorl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    notl %ecx
-; X86-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel %ebp, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16

diff  --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 367ca660cda14..6631c6c4cc014 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -9,39 +9,44 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %r12
 ; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    .cfi_offset %rbx, -32
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 40
+; X64-NEXT:    .cfi_offset %rbx, -40
+; X64-NEXT:    .cfi_offset %r12, -32
 ; X64-NEXT:    .cfi_offset %r14, -24
 ; X64-NEXT:    .cfi_offset %r15, -16
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rdi, %r10
-; X64-NEXT:    movq %rsi, %r14
-; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    imulq %r14, %rdi
+; X64-NEXT:    movq %rsi, %r9
+; X64-NEXT:    sarq $63, %r9
+; X64-NEXT:    movq %r9, %r11
+; X64-NEXT:    andq %rdx, %r11
 ; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    imulq %rcx, %r14
-; X64-NEXT:    addq %rdx, %r14
-; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    sarq $63, %rdi
-; X64-NEXT:    movq %rdi, %r15
-; X64-NEXT:    imulq %rsi, %r15
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    subq %r11, %r14
+; X64-NEXT:    andq %rcx, %r9
+; X64-NEXT:    subq %r9, %r14
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    movq %r15, %r12
+; X64-NEXT:    andq %rsi, %r12
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %r15, %rdx
-; X64-NEXT:    imulq %r10, %rdi
-; X64-NEXT:    addq %rdx, %rdi
-; X64-NEXT:    addq %r9, %r11
-; X64-NEXT:    adcq %r14, %rdi
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    subq %r12, %r9
+; X64-NEXT:    andq %r10, %r15
+; X64-NEXT:    subq %r15, %r9
+; X64-NEXT:    addq %rdi, %r11
+; X64-NEXT:    adcq %r14, %r9
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbx
@@ -61,15 +66,16 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    adcq %rbx, %rdx
 ; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    adcq %rdi, %rdx
+; X64-NEXT:    adcq %r9, %rdx
 ; X64-NEXT:    movq %r10, 8(%r8)
 ; X64-NEXT:    sarq $63, %r10
 ; X64-NEXT:    xorq %r10, %rdx
 ; X64-NEXT:    xorq %rax, %r10
 ; X64-NEXT:    orq %rdx, %r10
 ; X64-NEXT:    setne %al
-; X64-NEXT:    movq %r9, (%r8)
+; X64-NEXT:    movq %rdi, (%r8)
 ; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
 ; X64-NEXT:    retq
@@ -84,8 +90,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $56, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 76
+; X86-NEXT:    subl $60, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 80
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -99,226 +105,229 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl %edi, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    imull %esi, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT:    subl %esi, %edi
+; X86-NEXT:    andl %ecx, %ebx
+; X86-NEXT:    subl %ebx, %edi
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    subl %esi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    subl %eax, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    addl %esi, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl (%esp), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    imull %ebx, %edi
-; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %eax, %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl %esi, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebx, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    andl %edi, %eax
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    orl %ebp, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -326,7 +335,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -360,234 +369,239 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movq %rcx, %r11
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rsi, %r10
+; X64-NEXT:    movq %rdi, %r11
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rsi, %r10
-; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r10, %r14
-; X64-NEXT:    adcq %rcx, %r12
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rdi, %rbx
+; X64-NEXT:    adcq %rsi, %r12
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %ecx
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movzbl %al, %edi
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r12, %rbx
-; X64-NEXT:    adcq %rcx, %r11
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    movq %r9, %rcx
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %r12, %rsi
+; X64-NEXT:    adcq %rdi, %rdx
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r8, %rdi
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r8, %r13
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %r9, %rsi
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r12, %r10
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r15, %r9
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    adcq %r12, %rdi
+; X64-NEXT:    setb %r9b
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %r10, %r8
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    addq %rdi, %r8
+; X64-NEXT:    movzbl %r9b, %eax
 ; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %rbp
-; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    adcq $0, %r11
+; X64-NEXT:    adcq %rbx, %rbp
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq %r15, %r12
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, %rsi
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r10, %rcx
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r10, %r9
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rdi, %r10
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %r9, %rax
-; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %r10, %r15
 ; X64-NEXT:    adcq %r13, %r11
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r11, %r13
-; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r10
-; X64-NEXT:    addq %r8, %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %rbp, %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    adcq %rax, %rdi
+; X64-NEXT:    addq %r8, %rbx
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %rbp, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    addq %rbx, %r13
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    addq %rsi, %r13
+; X64-NEXT:    adcq %r12, %rdi
+; X64-NEXT:    setb %r11b
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rsi, %r8
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    addq %r8, %rax
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    adcq %rdi, %r9
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    adcq %rsi, %r10
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %rbx, %rsi
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %r9, %r14
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    addq %r13, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r10, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %r10, %rbx
 ; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r14
+; X64-NEXT:    adcq %rax, %rbp
+; X64-NEXT:    addq %r13, %r15
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %rdi, %r8
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movzbl %r11b, %eax
+; X64-NEXT:    adcq %rax, %rbx
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %rbx, %r13
-; X64-NEXT:    movq %rbx, %r10
+; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    sarq $63, %r13
 ; X64-NEXT:    movq %r13, %rcx
-; X64-NEXT:    imulq %r12, %rcx
+; X64-NEXT:    andq %r9, %rcx
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    imulq %r13, %r15
-; X64-NEXT:    addq %rdx, %r15
-; X64-NEXT:    movq %r13, %rcx
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    subq %rcx, %r10
+; X64-NEXT:    andq %r13, %r14
+; X64-NEXT:    subq %r14, %r10
+; X64-NEXT:    movq %r13, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT:    andq %r14, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    imulq %rdi, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    addq %rdx, %rcx
-; X64-NEXT:    imulq %r13, %rsi
-; X64-NEXT:    addq %rcx, %rsi
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    subq %rsi, %rcx
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    subq %rdi, %rcx
+; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    addq %rax, %r8
-; X64-NEXT:    adcq %r15, %rsi
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    adcq %r10, %rcx
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r9, %r15
+; X64-NEXT:    addq %r11, %r15
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    addq %rcx, %r15
-; X64-NEXT:    adcq %r9, %r13
-; X64-NEXT:    setb %cl
+; X64-NEXT:    addq %rsi, %r15
+; X64-NEXT:    adcq %r11, %r13
+; X64-NEXT:    setb %sil
 ; X64-NEXT:    addq %rax, %r13
-; X64-NEXT:    movzbl %cl, %r9d
-; X64-NEXT:    adcq %rdx, %r9
+; X64-NEXT:    movzbl %sil, %esi
+; X64-NEXT:    adcq %rdx, %rsi
 ; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    adcq %rsi, %r9
-; X64-NEXT:    sarq $63, %r12
+; X64-NEXT:    adcq %rcx, %rsi
+; X64-NEXT:    sarq $63, %r9
+; X64-NEXT:    movq %r9, %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    imulq %r12, %r8
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    andq %rax, %r8
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    subq %r8, %r14
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    imulq %r12, %rbx
-; X64-NEXT:    addq %r8, %rbx
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    subq %rax, %r14
+; X64-NEXT:    movq %r9, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    imulq %r12, %rcx
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    imulq %r12, %r10
-; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    addq %rsi, %r8
-; X64-NEXT:    adcq %rbx, %r10
-; X64-NEXT:    movq %rsi, %rbx
-; X64-NEXT:    addq %r11, %rbx
+; X64-NEXT:    andq %rax, %r12
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    subq %r12, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; X64-NEXT:    andq %r9, %rax
+; X64-NEXT:    subq %rax, %r8
+; X64-NEXT:    addq %rcx, %r10
+; X64-NEXT:    adcq %r14, %r8
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    addq %r11, %r14
 ; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %rdi
-; X64-NEXT:    addq %rax, %rbx
+; X64-NEXT:    addq %rax, %r14
 ; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    setb %cl
+; X64-NEXT:    setb %r9b
 ; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movzbl %r9b, %eax
 ; X64-NEXT:    adcq %rdx, %rax
-; X64-NEXT:    addq %r8, %r11
-; X64-NEXT:    adcq %r10, %rax
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r15, %rbx
+; X64-NEXT:    addq %r10, %r11
+; X64-NEXT:    adcq %r8, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT:    adcq %r15, %r14
 ; X64-NEXT:    adcq %r13, %r11
-; X64-NEXT:    adcq %r9, %rax
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r14, %r11
+; X64-NEXT:    adcq %rsi, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
+; X64-NEXT:    adcq %rbx, %r11
 ; X64-NEXT:    adcq %rbp, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    xorq %rcx, %rax
-; X64-NEXT:    xorq %rcx, %rbx
-; X64-NEXT:    orq %rax, %rbx
-; X64-NEXT:    xorq %rcx, %r11
-; X64-NEXT:    xorq %rsi, %rcx
-; X64-NEXT:    orq %r11, %rcx
-; X64-NEXT:    orq %rbx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    xorq %rdx, %rax
+; X64-NEXT:    xorq %rdx, %r14
+; X64-NEXT:    orq %rax, %r14
+; X64-NEXT:    xorq %rdx, %r11
+; X64-NEXT:    xorq %rcx, %rdx
+; X64-NEXT:    orq %r11, %rdx
+; X64-NEXT:    orq %r14, %rdx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq %rdx, 24(%rax)
+; X64-NEXT:    movq %rsi, 24(%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; X64-NEXT:    movq %rcx, (%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -613,400 +627,399 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $156, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 176
+; X86-NEXT:    subl $152, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 172
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
@@ -1019,9 +1032,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
@@ -1034,41 +1047,13 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
@@ -1077,89 +1062,117 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
@@ -1175,25 +1188,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
@@ -1201,76 +1214,75 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %al, %edx
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %edx, %ecx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    andl %eax, %ebx
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebp, %esi
@@ -1280,263 +1292,266 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    subl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT:    andl %edi, %edx
+; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    subl %edx, %esi
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    subl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %bl, %ebx
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movzbl %bl, %ebp
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %ebx, %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    addl %ebp, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    andl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %dl
 ; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movzbl %dl, %eax
-; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movzbl %dl, %ecx
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    andl %ebp, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull %ebp, %esi
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    subl %esi, %ebx
+; X86-NEXT:    andl %ebp, %ecx
+; X86-NEXT:    subl %ecx, %ebx
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    subl %ecx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    subl %eax, %ebp
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl (%esp), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    xorl %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    xorl %edi, %edx
-; X86-NEXT:    xorl %edi, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    xorl %edi, %ebp
-; X86-NEXT:    orl %eax, %ebp
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 28(%eax)
+; X86-NEXT:    movl %ebp, 28(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1552,7 +1567,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $156, %esp
+; X86-NEXT:    addl $152, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index dbec86755a969..641663d9eedfe 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3297,31 +3297,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
 ; SSE2-NEXT:    movq %r8, %r14
+; SSE2-NEXT:    movq %rcx, %rbp
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rsi, %r11
 ; SSE2-NEXT:    movq %rdi, %r10
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT:    movq %r11, %r12
-; SSE2-NEXT:    sarq $63, %r12
-; SSE2-NEXT:    movq %r14, %rbx
-; SSE2-NEXT:    imulq %r12, %rbx
+; SSE2-NEXT:    movq %r11, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movq %rbx, %r15
+; SSE2-NEXT:    andq %r14, %r15
 ; SSE2-NEXT:    movq %r14, %rax
-; SSE2-NEXT:    mulq %r12
+; SSE2-NEXT:    mulq %rbx
 ; SSE2-NEXT:    movq %rax, %rdi
-; SSE2-NEXT:    addq %rbx, %rdx
-; SSE2-NEXT:    imulq %r9, %r12
-; SSE2-NEXT:    addq %rdx, %r12
-; SSE2-NEXT:    movq %r9, %rbx
-; SSE2-NEXT:    sarq $63, %rbx
-; SSE2-NEXT:    movq %rbx, %r13
-; SSE2-NEXT:    imulq %r11, %r13
-; SSE2-NEXT:    movq %rbx, %rax
+; SSE2-NEXT:    movq %rdx, %r12
+; SSE2-NEXT:    subq %r15, %r12
+; SSE2-NEXT:    andq %r9, %rbx
+; SSE2-NEXT:    subq %rbx, %r12
+; SSE2-NEXT:    movq %r9, %r13
+; SSE2-NEXT:    sarq $63, %r13
+; SSE2-NEXT:    movq %r13, %rcx
+; SSE2-NEXT:    andq %r11, %rcx
+; SSE2-NEXT:    movq %r13, %rax
 ; SSE2-NEXT:    mulq %r10
 ; SSE2-NEXT:    movq %rax, %r15
-; SSE2-NEXT:    addq %r13, %rdx
-; SSE2-NEXT:    imulq %r10, %rbx
-; SSE2-NEXT:    addq %rdx, %rbx
+; SSE2-NEXT:    movq %rdx, %rbx
+; SSE2-NEXT:    subq %rcx, %rbx
+; SSE2-NEXT:    andq %r10, %r13
+; SSE2-NEXT:    subq %r13, %rbx
 ; SSE2-NEXT:    addq %rdi, %r15
 ; SSE2-NEXT:    adcq %r12, %rbx
 ; SSE2-NEXT:    movq %r10, %rax
@@ -3341,11 +3343,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    addq %r13, %r10
 ; SSE2-NEXT:    adcq %r14, %r12
 ; SSE2-NEXT:    setb %al
-; SSE2-NEXT:    movzbl %al, %r14d
+; SSE2-NEXT:    movzbl %al, %ecx
 ; SSE2-NEXT:    movq %r11, %rax
 ; SSE2-NEXT:    mulq %r9
 ; SSE2-NEXT:    addq %r12, %rax
-; SSE2-NEXT:    adcq %r14, %rdx
+; SSE2-NEXT:    adcq %rcx, %rdx
 ; SSE2-NEXT:    addq %r15, %rax
 ; SSE2-NEXT:    adcq %rbx, %rdx
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3356,52 +3358,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    xorl %r15d, %r15d
 ; SSE2-NEXT:    orq %rdx, %r10
 ; SSE2-NEXT:    setne %r15b
-; SSE2-NEXT:    movq %rcx, %rbx
-; SSE2-NEXT:    sarq $63, %rbx
-; SSE2-NEXT:    movq %rsi, %r10
-; SSE2-NEXT:    imulq %rbx, %r10
+; SSE2-NEXT:    movq %rbp, %rcx
+; SSE2-NEXT:    sarq $63, %rcx
+; SSE2-NEXT:    movq %rcx, %r11
+; SSE2-NEXT:    andq %rsi, %r11
 ; SSE2-NEXT:    movq %rsi, %rax
-; SSE2-NEXT:    mulq %rbx
+; SSE2-NEXT:    mulq %rcx
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    addq %r10, %rdx
-; SSE2-NEXT:    imulq %rbp, %rbx
-; SSE2-NEXT:    addq %rdx, %rbx
-; SSE2-NEXT:    movq %rbp, %r10
-; SSE2-NEXT:    sarq $63, %r10
-; SSE2-NEXT:    movq %r10, %r14
-; SSE2-NEXT:    imulq %rcx, %r14
-; SSE2-NEXT:    movq %r10, %rax
-; SSE2-NEXT:    mulq %r8
+; SSE2-NEXT:    movq %rdx, %r10
+; SSE2-NEXT:    subq %r11, %r10
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andq %rax, %rcx
+; SSE2-NEXT:    subq %rcx, %r10
 ; SSE2-NEXT:    movq %rax, %r11
-; SSE2-NEXT:    addq %r14, %rdx
-; SSE2-NEXT:    imulq %r8, %r10
-; SSE2-NEXT:    addq %rdx, %r10
-; SSE2-NEXT:    addq %r9, %r11
-; SSE2-NEXT:    adcq %rbx, %r10
+; SSE2-NEXT:    movq %rax, %r13
+; SSE2-NEXT:    sarq $63, %r11
+; SSE2-NEXT:    movq %r11, %rcx
+; SSE2-NEXT:    andq %rbp, %rcx
+; SSE2-NEXT:    movq %r11, %rax
+; SSE2-NEXT:    mulq %r8
+; SSE2-NEXT:    movq %rax, %rbx
+; SSE2-NEXT:    movq %rdx, %r14
+; SSE2-NEXT:    subq %rcx, %r14
+; SSE2-NEXT:    andq %r8, %r11
+; SSE2-NEXT:    subq %r11, %r14
+; SSE2-NEXT:    addq %r9, %rbx
+; SSE2-NEXT:    adcq %r10, %r14
 ; SSE2-NEXT:    movq %r8, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %r9
-; SSE2-NEXT:    movq %rax, %rbx
-; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    movq %rax, %r10
+; SSE2-NEXT:    movq %rbp, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %rsi
-; SSE2-NEXT:    movq %rax, %r14
-; SSE2-NEXT:    addq %r9, %r14
+; SSE2-NEXT:    movq %rax, %r11
+; SSE2-NEXT:    addq %r9, %r11
 ; SSE2-NEXT:    adcq $0, %rsi
 ; SSE2-NEXT:    movq %r8, %rax
-; SSE2-NEXT:    mulq %rbp
+; SSE2-NEXT:    mulq %r13
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    addq %r14, %r9
+; SSE2-NEXT:    addq %r11, %r9
 ; SSE2-NEXT:    adcq %rsi, %r8
 ; SSE2-NEXT:    setb %al
-; SSE2-NEXT:    movzbl %al, %esi
-; SSE2-NEXT:    movq %rcx, %rax
-; SSE2-NEXT:    mulq %rbp
+; SSE2-NEXT:    movzbl %al, %ecx
+; SSE2-NEXT:    movq %rbp, %rax
+; SSE2-NEXT:    mulq %r13
 ; SSE2-NEXT:    addq %r8, %rax
-; SSE2-NEXT:    adcq %rsi, %rdx
-; SSE2-NEXT:    addq %r11, %rax
-; SSE2-NEXT:    adcq %r10, %rdx
+; SSE2-NEXT:    adcq %rcx, %rdx
+; SSE2-NEXT:    addq %rbx, %rax
+; SSE2-NEXT:    adcq %r14, %rdx
 ; SSE2-NEXT:    movq %r9, 24(%r12)
 ; SSE2-NEXT:    sarq $63, %r9
 ; SSE2-NEXT:    xorq %r9, %rdx
@@ -3414,7 +3420,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    negl %r15d
 ; SSE2-NEXT:    movd %r15d, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movq %rbx, 16(%r12)
+; SSE2-NEXT:    movq %r10, 16(%r12)
 ; SSE2-NEXT:    movq %rdi, (%r12)
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
@@ -3433,31 +3439,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    pushq %r12
 ; SSSE3-NEXT:    pushq %rbx
 ; SSSE3-NEXT:    movq %r8, %r14
+; SSSE3-NEXT:    movq %rcx, %rbp
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rsi, %r11
 ; SSSE3-NEXT:    movq %rdi, %r10
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSSE3-NEXT:    movq %r11, %r12
-; SSSE3-NEXT:    sarq $63, %r12
-; SSSE3-NEXT:    movq %r14, %rbx
-; SSSE3-NEXT:    imulq %r12, %rbx
+; SSSE3-NEXT:    movq %r11, %rbx
+; SSSE3-NEXT:    sarq $63, %rbx
+; SSSE3-NEXT:    movq %rbx, %r15
+; SSSE3-NEXT:    andq %r14, %r15
 ; SSSE3-NEXT:    movq %r14, %rax
-; SSSE3-NEXT:    mulq %r12
+; SSSE3-NEXT:    mulq %rbx
 ; SSSE3-NEXT:    movq %rax, %rdi
-; SSSE3-NEXT:    addq %rbx, %rdx
-; SSSE3-NEXT:    imulq %r9, %r12
-; SSSE3-NEXT:    addq %rdx, %r12
-; SSSE3-NEXT:    movq %r9, %rbx
-; SSSE3-NEXT:    sarq $63, %rbx
-; SSSE3-NEXT:    movq %rbx, %r13
-; SSSE3-NEXT:    imulq %r11, %r13
-; SSSE3-NEXT:    movq %rbx, %rax
+; SSSE3-NEXT:    movq %rdx, %r12
+; SSSE3-NEXT:    subq %r15, %r12
+; SSSE3-NEXT:    andq %r9, %rbx
+; SSSE3-NEXT:    subq %rbx, %r12
+; SSSE3-NEXT:    movq %r9, %r13
+; SSSE3-NEXT:    sarq $63, %r13
+; SSSE3-NEXT:    movq %r13, %rcx
+; SSSE3-NEXT:    andq %r11, %rcx
+; SSSE3-NEXT:    movq %r13, %rax
 ; SSSE3-NEXT:    mulq %r10
 ; SSSE3-NEXT:    movq %rax, %r15
-; SSSE3-NEXT:    addq %r13, %rdx
-; SSSE3-NEXT:    imulq %r10, %rbx
-; SSSE3-NEXT:    addq %rdx, %rbx
+; SSSE3-NEXT:    movq %rdx, %rbx
+; SSSE3-NEXT:    subq %rcx, %rbx
+; SSSE3-NEXT:    andq %r10, %r13
+; SSSE3-NEXT:    subq %r13, %rbx
 ; SSSE3-NEXT:    addq %rdi, %r15
 ; SSSE3-NEXT:    adcq %r12, %rbx
 ; SSSE3-NEXT:    movq %r10, %rax
@@ -3477,11 +3485,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    addq %r13, %r10
 ; SSSE3-NEXT:    adcq %r14, %r12
 ; SSSE3-NEXT:    setb %al
-; SSSE3-NEXT:    movzbl %al, %r14d
+; SSSE3-NEXT:    movzbl %al, %ecx
 ; SSSE3-NEXT:    movq %r11, %rax
 ; SSSE3-NEXT:    mulq %r9
 ; SSSE3-NEXT:    addq %r12, %rax
-; SSSE3-NEXT:    adcq %r14, %rdx
+; SSSE3-NEXT:    adcq %rcx, %rdx
 ; SSSE3-NEXT:    addq %r15, %rax
 ; SSSE3-NEXT:    adcq %rbx, %rdx
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3492,52 +3500,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    xorl %r15d, %r15d
 ; SSSE3-NEXT:    orq %rdx, %r10
 ; SSSE3-NEXT:    setne %r15b
-; SSSE3-NEXT:    movq %rcx, %rbx
-; SSSE3-NEXT:    sarq $63, %rbx
-; SSSE3-NEXT:    movq %rsi, %r10
-; SSSE3-NEXT:    imulq %rbx, %r10
+; SSSE3-NEXT:    movq %rbp, %rcx
+; SSSE3-NEXT:    sarq $63, %rcx
+; SSSE3-NEXT:    movq %rcx, %r11
+; SSSE3-NEXT:    andq %rsi, %r11
 ; SSSE3-NEXT:    movq %rsi, %rax
-; SSSE3-NEXT:    mulq %rbx
+; SSSE3-NEXT:    mulq %rcx
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    addq %r10, %rdx
-; SSSE3-NEXT:    imulq %rbp, %rbx
-; SSSE3-NEXT:    addq %rdx, %rbx
-; SSSE3-NEXT:    movq %rbp, %r10
-; SSSE3-NEXT:    sarq $63, %r10
-; SSSE3-NEXT:    movq %r10, %r14
-; SSSE3-NEXT:    imulq %rcx, %r14
-; SSSE3-NEXT:    movq %r10, %rax
-; SSSE3-NEXT:    mulq %r8
+; SSSE3-NEXT:    movq %rdx, %r10
+; SSSE3-NEXT:    subq %r11, %r10
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT:    andq %rax, %rcx
+; SSSE3-NEXT:    subq %rcx, %r10
 ; SSSE3-NEXT:    movq %rax, %r11
-; SSSE3-NEXT:    addq %r14, %rdx
-; SSSE3-NEXT:    imulq %r8, %r10
-; SSSE3-NEXT:    addq %rdx, %r10
-; SSSE3-NEXT:    addq %r9, %r11
-; SSSE3-NEXT:    adcq %rbx, %r10
+; SSSE3-NEXT:    movq %rax, %r13
+; SSSE3-NEXT:    sarq $63, %r11
+; SSSE3-NEXT:    movq %r11, %rcx
+; SSSE3-NEXT:    andq %rbp, %rcx
+; SSSE3-NEXT:    movq %r11, %rax
+; SSSE3-NEXT:    mulq %r8
+; SSSE3-NEXT:    movq %rax, %rbx
+; SSSE3-NEXT:    movq %rdx, %r14
+; SSSE3-NEXT:    subq %rcx, %r14
+; SSSE3-NEXT:    andq %r8, %r11
+; SSSE3-NEXT:    subq %r11, %r14
+; SSSE3-NEXT:    addq %r9, %rbx
+; SSSE3-NEXT:    adcq %r10, %r14
 ; SSSE3-NEXT:    movq %r8, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %r9
-; SSSE3-NEXT:    movq %rax, %rbx
-; SSSE3-NEXT:    movq %rcx, %rax
+; SSSE3-NEXT:    movq %rax, %r10
+; SSSE3-NEXT:    movq %rbp, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %rsi
-; SSSE3-NEXT:    movq %rax, %r14
-; SSSE3-NEXT:    addq %r9, %r14
+; SSSE3-NEXT:    movq %rax, %r11
+; SSSE3-NEXT:    addq %r9, %r11
 ; SSSE3-NEXT:    adcq $0, %rsi
 ; SSSE3-NEXT:    movq %r8, %rax
-; SSSE3-NEXT:    mulq %rbp
+; SSSE3-NEXT:    mulq %r13
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    addq %r14, %r9
+; SSSE3-NEXT:    addq %r11, %r9
 ; SSSE3-NEXT:    adcq %rsi, %r8
 ; SSSE3-NEXT:    setb %al
-; SSSE3-NEXT:    movzbl %al, %esi
-; SSSE3-NEXT:    movq %rcx, %rax
-; SSSE3-NEXT:    mulq %rbp
+; SSSE3-NEXT:    movzbl %al, %ecx
+; SSSE3-NEXT:    movq %rbp, %rax
+; SSSE3-NEXT:    mulq %r13
 ; SSSE3-NEXT:    addq %r8, %rax
-; SSSE3-NEXT:    adcq %rsi, %rdx
-; SSSE3-NEXT:    addq %r11, %rax
-; SSSE3-NEXT:    adcq %r10, %rdx
+; SSSE3-NEXT:    adcq %rcx, %rdx
+; SSSE3-NEXT:    addq %rbx, %rax
+; SSSE3-NEXT:    adcq %r14, %rdx
 ; SSSE3-NEXT:    movq %r9, 24(%r12)
 ; SSSE3-NEXT:    sarq $63, %r9
 ; SSSE3-NEXT:    xorq %r9, %rdx
@@ -3550,7 +3562,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    negl %r15d
 ; SSSE3-NEXT:    movd %r15d, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movq %rbx, 16(%r12)
+; SSSE3-NEXT:    movq %r10, 16(%r12)
 ; SSSE3-NEXT:    movq %rdi, (%r12)
 ; SSSE3-NEXT:    popq %rbx
 ; SSSE3-NEXT:    popq %r12
@@ -3569,31 +3581,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    pushq %r12
 ; SSE41-NEXT:    pushq %rbx
 ; SSE41-NEXT:    movq %r8, %r14
+; SSE41-NEXT:    movq %rcx, %rbp
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rsi, %r11
 ; SSE41-NEXT:    movq %rdi, %r10
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT:    movq %r11, %r12
-; SSE41-NEXT:    sarq $63, %r12
-; SSE41-NEXT:    movq %r14, %rbx
-; SSE41-NEXT:    imulq %r12, %rbx
+; SSE41-NEXT:    movq %r11, %rbx
+; SSE41-NEXT:    sarq $63, %rbx
+; SSE41-NEXT:    movq %rbx, %r15
+; SSE41-NEXT:    andq %r14, %r15
 ; SSE41-NEXT:    movq %r14, %rax
-; SSE41-NEXT:    mulq %r12
+; SSE41-NEXT:    mulq %rbx
 ; SSE41-NEXT:    movq %rax, %rdi
-; SSE41-NEXT:    addq %rbx, %rdx
-; SSE41-NEXT:    imulq %r9, %r12
-; SSE41-NEXT:    addq %rdx, %r12
-; SSE41-NEXT:    movq %r9, %rbx
-; SSE41-NEXT:    sarq $63, %rbx
-; SSE41-NEXT:    movq %rbx, %r13
-; SSE41-NEXT:    imulq %r11, %r13
-; SSE41-NEXT:    movq %rbx, %rax
+; SSE41-NEXT:    movq %rdx, %r12
+; SSE41-NEXT:    subq %r15, %r12
+; SSE41-NEXT:    andq %r9, %rbx
+; SSE41-NEXT:    subq %rbx, %r12
+; SSE41-NEXT:    movq %r9, %r13
+; SSE41-NEXT:    sarq $63, %r13
+; SSE41-NEXT:    movq %r13, %rcx
+; SSE41-NEXT:    andq %r11, %rcx
+; SSE41-NEXT:    movq %r13, %rax
 ; SSE41-NEXT:    mulq %r10
 ; SSE41-NEXT:    movq %rax, %r15
-; SSE41-NEXT:    addq %r13, %rdx
-; SSE41-NEXT:    imulq %r10, %rbx
-; SSE41-NEXT:    addq %rdx, %rbx
+; SSE41-NEXT:    movq %rdx, %rbx
+; SSE41-NEXT:    subq %rcx, %rbx
+; SSE41-NEXT:    andq %r10, %r13
+; SSE41-NEXT:    subq %r13, %rbx
 ; SSE41-NEXT:    addq %rdi, %r15
 ; SSE41-NEXT:    adcq %r12, %rbx
 ; SSE41-NEXT:    movq %r10, %rax
@@ -3613,11 +3627,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    addq %r13, %r10
 ; SSE41-NEXT:    adcq %r14, %r12
 ; SSE41-NEXT:    setb %al
-; SSE41-NEXT:    movzbl %al, %r14d
+; SSE41-NEXT:    movzbl %al, %ecx
 ; SSE41-NEXT:    movq %r11, %rax
 ; SSE41-NEXT:    mulq %r9
 ; SSE41-NEXT:    addq %r12, %rax
-; SSE41-NEXT:    adcq %r14, %rdx
+; SSE41-NEXT:    adcq %rcx, %rdx
 ; SSE41-NEXT:    addq %r15, %rax
 ; SSE41-NEXT:    adcq %rbx, %rdx
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3628,52 +3642,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    xorl %r15d, %r15d
 ; SSE41-NEXT:    orq %rdx, %r10
 ; SSE41-NEXT:    setne %r15b
-; SSE41-NEXT:    movq %rcx, %rbx
-; SSE41-NEXT:    sarq $63, %rbx
-; SSE41-NEXT:    movq %rsi, %r10
-; SSE41-NEXT:    imulq %rbx, %r10
+; SSE41-NEXT:    movq %rbp, %rcx
+; SSE41-NEXT:    sarq $63, %rcx
+; SSE41-NEXT:    movq %rcx, %r11
+; SSE41-NEXT:    andq %rsi, %r11
 ; SSE41-NEXT:    movq %rsi, %rax
-; SSE41-NEXT:    mulq %rbx
+; SSE41-NEXT:    mulq %rcx
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    addq %r10, %rdx
-; SSE41-NEXT:    imulq %rbp, %rbx
-; SSE41-NEXT:    addq %rdx, %rbx
-; SSE41-NEXT:    movq %rbp, %r10
-; SSE41-NEXT:    sarq $63, %r10
-; SSE41-NEXT:    movq %r10, %r14
-; SSE41-NEXT:    imulq %rcx, %r14
-; SSE41-NEXT:    movq %r10, %rax
-; SSE41-NEXT:    mulq %r8
+; SSE41-NEXT:    movq %rdx, %r10
+; SSE41-NEXT:    subq %r11, %r10
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE41-NEXT:    andq %rax, %rcx
+; SSE41-NEXT:    subq %rcx, %r10
 ; SSE41-NEXT:    movq %rax, %r11
-; SSE41-NEXT:    addq %r14, %rdx
-; SSE41-NEXT:    imulq %r8, %r10
-; SSE41-NEXT:    addq %rdx, %r10
-; SSE41-NEXT:    addq %r9, %r11
-; SSE41-NEXT:    adcq %rbx, %r10
+; SSE41-NEXT:    movq %rax, %r13
+; SSE41-NEXT:    sarq $63, %r11
+; SSE41-NEXT:    movq %r11, %rcx
+; SSE41-NEXT:    andq %rbp, %rcx
+; SSE41-NEXT:    movq %r11, %rax
+; SSE41-NEXT:    mulq %r8
+; SSE41-NEXT:    movq %rax, %rbx
+; SSE41-NEXT:    movq %rdx, %r14
+; SSE41-NEXT:    subq %rcx, %r14
+; SSE41-NEXT:    andq %r8, %r11
+; SSE41-NEXT:    subq %r11, %r14
+; SSE41-NEXT:    addq %r9, %rbx
+; SSE41-NEXT:    adcq %r10, %r14
 ; SSE41-NEXT:    movq %r8, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %r9
-; SSE41-NEXT:    movq %rax, %rbx
-; SSE41-NEXT:    movq %rcx, %rax
+; SSE41-NEXT:    movq %rax, %r10
+; SSE41-NEXT:    movq %rbp, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %rsi
-; SSE41-NEXT:    movq %rax, %r14
-; SSE41-NEXT:    addq %r9, %r14
+; SSE41-NEXT:    movq %rax, %r11
+; SSE41-NEXT:    addq %r9, %r11
 ; SSE41-NEXT:    adcq $0, %rsi
 ; SSE41-NEXT:    movq %r8, %rax
-; SSE41-NEXT:    mulq %rbp
+; SSE41-NEXT:    mulq %r13
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    addq %r14, %r9
+; SSE41-NEXT:    addq %r11, %r9
 ; SSE41-NEXT:    adcq %rsi, %r8
 ; SSE41-NEXT:    setb %al
-; SSE41-NEXT:    movzbl %al, %esi
-; SSE41-NEXT:    movq %rcx, %rax
-; SSE41-NEXT:    mulq %rbp
+; SSE41-NEXT:    movzbl %al, %ecx
+; SSE41-NEXT:    movq %rbp, %rax
+; SSE41-NEXT:    mulq %r13
 ; SSE41-NEXT:    addq %r8, %rax
-; SSE41-NEXT:    adcq %rsi, %rdx
-; SSE41-NEXT:    addq %r11, %rax
-; SSE41-NEXT:    adcq %r10, %rdx
+; SSE41-NEXT:    adcq %rcx, %rdx
+; SSE41-NEXT:    addq %rbx, %rax
+; SSE41-NEXT:    adcq %r14, %rdx
 ; SSE41-NEXT:    movq %r9, 24(%r12)
 ; SSE41-NEXT:    sarq $63, %r9
 ; SSE41-NEXT:    xorq %r9, %rdx
@@ -3685,7 +3703,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    negl %r15d
 ; SSE41-NEXT:    movd %r15d, %xmm0
 ; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movq %rbx, 16(%r12)
+; SSE41-NEXT:    movq %r10, 16(%r12)
 ; SSE41-NEXT:    movq %rdi, (%r12)
 ; SSE41-NEXT:    popq %rbx
 ; SSE41-NEXT:    popq %r12
@@ -3704,31 +3722,33 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    pushq %r12
 ; AVX-NEXT:    pushq %rbx
 ; AVX-NEXT:    movq %r8, %r14
+; AVX-NEXT:    movq %rcx, %rbp
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rsi, %r11
 ; AVX-NEXT:    movq %rdi, %r10
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT:    movq %r11, %r12
-; AVX-NEXT:    sarq $63, %r12
-; AVX-NEXT:    movq %r14, %rbx
-; AVX-NEXT:    imulq %r12, %rbx
+; AVX-NEXT:    movq %r11, %rbx
+; AVX-NEXT:    sarq $63, %rbx
+; AVX-NEXT:    movq %rbx, %r15
+; AVX-NEXT:    andq %r14, %r15
 ; AVX-NEXT:    movq %r14, %rax
-; AVX-NEXT:    mulq %r12
+; AVX-NEXT:    mulq %rbx
 ; AVX-NEXT:    movq %rax, %rdi
-; AVX-NEXT:    addq %rbx, %rdx
-; AVX-NEXT:    imulq %r9, %r12
-; AVX-NEXT:    addq %rdx, %r12
-; AVX-NEXT:    movq %r9, %rbx
-; AVX-NEXT:    sarq $63, %rbx
-; AVX-NEXT:    movq %rbx, %r13
-; AVX-NEXT:    imulq %r11, %r13
-; AVX-NEXT:    movq %rbx, %rax
+; AVX-NEXT:    movq %rdx, %r12
+; AVX-NEXT:    subq %r15, %r12
+; AVX-NEXT:    andq %r9, %rbx
+; AVX-NEXT:    subq %rbx, %r12
+; AVX-NEXT:    movq %r9, %r13
+; AVX-NEXT:    sarq $63, %r13
+; AVX-NEXT:    movq %r13, %rcx
+; AVX-NEXT:    andq %r11, %rcx
+; AVX-NEXT:    movq %r13, %rax
 ; AVX-NEXT:    mulq %r10
 ; AVX-NEXT:    movq %rax, %r15
-; AVX-NEXT:    addq %r13, %rdx
-; AVX-NEXT:    imulq %r10, %rbx
-; AVX-NEXT:    addq %rdx, %rbx
+; AVX-NEXT:    movq %rdx, %rbx
+; AVX-NEXT:    subq %rcx, %rbx
+; AVX-NEXT:    andq %r10, %r13
+; AVX-NEXT:    subq %r13, %rbx
 ; AVX-NEXT:    addq %rdi, %r15
 ; AVX-NEXT:    adcq %r12, %rbx
 ; AVX-NEXT:    movq %r10, %rax
@@ -3748,11 +3768,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    addq %r13, %r10
 ; AVX-NEXT:    adcq %r14, %r12
 ; AVX-NEXT:    setb %al
-; AVX-NEXT:    movzbl %al, %r14d
+; AVX-NEXT:    movzbl %al, %ecx
 ; AVX-NEXT:    movq %r11, %rax
 ; AVX-NEXT:    mulq %r9
 ; AVX-NEXT:    addq %r12, %rax
-; AVX-NEXT:    adcq %r14, %rdx
+; AVX-NEXT:    adcq %rcx, %rdx
 ; AVX-NEXT:    addq %r15, %rax
 ; AVX-NEXT:    adcq %rbx, %rdx
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3763,52 +3783,56 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    xorl %r15d, %r15d
 ; AVX-NEXT:    orq %rdx, %r10
 ; AVX-NEXT:    setne %r15b
-; AVX-NEXT:    movq %rcx, %rbx
-; AVX-NEXT:    sarq $63, %rbx
-; AVX-NEXT:    movq %rsi, %r10
-; AVX-NEXT:    imulq %rbx, %r10
+; AVX-NEXT:    movq %rbp, %rcx
+; AVX-NEXT:    sarq $63, %rcx
+; AVX-NEXT:    movq %rcx, %r11
+; AVX-NEXT:    andq %rsi, %r11
 ; AVX-NEXT:    movq %rsi, %rax
-; AVX-NEXT:    mulq %rbx
+; AVX-NEXT:    mulq %rcx
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    addq %r10, %rdx
-; AVX-NEXT:    imulq %rbp, %rbx
-; AVX-NEXT:    addq %rdx, %rbx
-; AVX-NEXT:    movq %rbp, %r10
-; AVX-NEXT:    sarq $63, %r10
-; AVX-NEXT:    movq %r10, %r14
-; AVX-NEXT:    imulq %rcx, %r14
-; AVX-NEXT:    movq %r10, %rax
-; AVX-NEXT:    mulq %r8
+; AVX-NEXT:    movq %rdx, %r10
+; AVX-NEXT:    subq %r11, %r10
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    andq %rax, %rcx
+; AVX-NEXT:    subq %rcx, %r10
 ; AVX-NEXT:    movq %rax, %r11
-; AVX-NEXT:    addq %r14, %rdx
-; AVX-NEXT:    imulq %r8, %r10
-; AVX-NEXT:    addq %rdx, %r10
-; AVX-NEXT:    addq %r9, %r11
-; AVX-NEXT:    adcq %rbx, %r10
+; AVX-NEXT:    movq %rax, %r13
+; AVX-NEXT:    sarq $63, %r11
+; AVX-NEXT:    movq %r11, %rcx
+; AVX-NEXT:    andq %rbp, %rcx
+; AVX-NEXT:    movq %r11, %rax
+; AVX-NEXT:    mulq %r8
+; AVX-NEXT:    movq %rax, %rbx
+; AVX-NEXT:    movq %rdx, %r14
+; AVX-NEXT:    subq %rcx, %r14
+; AVX-NEXT:    andq %r8, %r11
+; AVX-NEXT:    subq %r11, %r14
+; AVX-NEXT:    addq %r9, %rbx
+; AVX-NEXT:    adcq %r10, %r14
 ; AVX-NEXT:    movq %r8, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %r9
-; AVX-NEXT:    movq %rax, %rbx
-; AVX-NEXT:    movq %rcx, %rax
+; AVX-NEXT:    movq %rax, %r10
+; AVX-NEXT:    movq %rbp, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %rsi
-; AVX-NEXT:    movq %rax, %r14
-; AVX-NEXT:    addq %r9, %r14
+; AVX-NEXT:    movq %rax, %r11
+; AVX-NEXT:    addq %r9, %r11
 ; AVX-NEXT:    adcq $0, %rsi
 ; AVX-NEXT:    movq %r8, %rax
-; AVX-NEXT:    mulq %rbp
+; AVX-NEXT:    mulq %r13
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    addq %r14, %r9
+; AVX-NEXT:    addq %r11, %r9
 ; AVX-NEXT:    adcq %rsi, %r8
 ; AVX-NEXT:    setb %al
-; AVX-NEXT:    movzbl %al, %esi
-; AVX-NEXT:    movq %rcx, %rax
-; AVX-NEXT:    mulq %rbp
+; AVX-NEXT:    movzbl %al, %ecx
+; AVX-NEXT:    movq %rbp, %rax
+; AVX-NEXT:    mulq %r13
 ; AVX-NEXT:    addq %r8, %rax
-; AVX-NEXT:    adcq %rsi, %rdx
-; AVX-NEXT:    addq %r11, %rax
-; AVX-NEXT:    adcq %r10, %rdx
+; AVX-NEXT:    adcq %rcx, %rdx
+; AVX-NEXT:    addq %rbx, %rax
+; AVX-NEXT:    adcq %r14, %rdx
 ; AVX-NEXT:    movq %r9, 24(%r12)
 ; AVX-NEXT:    sarq $63, %r9
 ; AVX-NEXT:    xorq %r9, %rdx
@@ -3820,7 +3844,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    negl %r15d
 ; AVX-NEXT:    vmovd %r15d, %xmm0
 ; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movq %rbx, 16(%r12)
+; AVX-NEXT:    movq %r10, 16(%r12)
 ; AVX-NEXT:    movq %rdi, (%r12)
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    popq %r12
@@ -3838,32 +3862,35 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    pushq %r13
 ; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq %r9, %rbp
+; AVX512F-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512F-NEXT:    movq %rcx, %r11
 ; AVX512F-NEXT:    movq %rdx, %r10
-; AVX512F-NEXT:    movq %rsi, %r9
+; AVX512F-NEXT:    movq %rsi, %rbp
+; AVX512F-NEXT:    movq %rdi, %r9
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT:    movq %rcx, %r12
-; AVX512F-NEXT:    sarq $63, %r12
-; AVX512F-NEXT:    movq %r15, %rbx
-; AVX512F-NEXT:    imulq %r12, %rbx
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    movq %rcx, %rbx
+; AVX512F-NEXT:    sarq $63, %rbx
+; AVX512F-NEXT:    movq %rbx, %r14
+; AVX512F-NEXT:    andq %r15, %r14
 ; AVX512F-NEXT:    movq %r15, %rax
-; AVX512F-NEXT:    mulq %r12
+; AVX512F-NEXT:    mulq %rbx
 ; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    addq %rbx, %rdx
-; AVX512F-NEXT:    imulq %rsi, %r12
-; AVX512F-NEXT:    addq %rdx, %r12
-; AVX512F-NEXT:    movq %rsi, %rbx
-; AVX512F-NEXT:    sarq $63, %rbx
-; AVX512F-NEXT:    movq %rbx, %r13
-; AVX512F-NEXT:    imulq %r11, %r13
-; AVX512F-NEXT:    movq %rbx, %rax
+; AVX512F-NEXT:    movq %rdx, %r12
+; AVX512F-NEXT:    subq %r14, %r12
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    subq %rbx, %r12
+; AVX512F-NEXT:    movq %rdi, %r13
+; AVX512F-NEXT:    sarq $63, %r13
+; AVX512F-NEXT:    movq %r13, %rsi
+; AVX512F-NEXT:    andq %r11, %rsi
+; AVX512F-NEXT:    movq %r13, %rax
 ; AVX512F-NEXT:    mulq %r10
 ; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    addq %r13, %rdx
-; AVX512F-NEXT:    imulq %r10, %rbx
-; AVX512F-NEXT:    addq %rdx, %rbx
+; AVX512F-NEXT:    movq %rdx, %rbx
+; AVX512F-NEXT:    subq %rsi, %rbx
+; AVX512F-NEXT:    andq %r10, %r13
+; AVX512F-NEXT:    subq %r13, %rbx
 ; AVX512F-NEXT:    addq %rcx, %r14
 ; AVX512F-NEXT:    adcq %r12, %rbx
 ; AVX512F-NEXT:    movq %r10, %rax
@@ -3877,74 +3904,78 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    addq %r12, %r13
 ; AVX512F-NEXT:    adcq $0, %r15
 ; AVX512F-NEXT:    movq %r10, %rax
-; AVX512F-NEXT:    mulq %rsi
+; AVX512F-NEXT:    mulq %rdi
 ; AVX512F-NEXT:    movq %rdx, %r12
 ; AVX512F-NEXT:    movq %rax, %r10
 ; AVX512F-NEXT:    addq %r13, %r10
 ; AVX512F-NEXT:    adcq %r15, %r12
 ; AVX512F-NEXT:    setb %al
-; AVX512F-NEXT:    movzbl %al, %r15d
+; AVX512F-NEXT:    movzbl %al, %esi
 ; AVX512F-NEXT:    movq %r11, %rax
-; AVX512F-NEXT:    mulq %rsi
+; AVX512F-NEXT:    mulq %rdi
 ; AVX512F-NEXT:    addq %r12, %rax
-; AVX512F-NEXT:    adcq %r15, %rdx
+; AVX512F-NEXT:    adcq %rsi, %rdx
 ; AVX512F-NEXT:    addq %r14, %rax
 ; AVX512F-NEXT:    adcq %rbx, %rdx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT:    movq %r10, 24(%r12)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; AVX512F-NEXT:    movq %r10, 24(%r13)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
 ; AVX512F-NEXT:    orq %rdx, %r10
 ; AVX512F-NEXT:    setne %al
 ; AVX512F-NEXT:    kmovw %eax, %k0
-; AVX512F-NEXT:    movq %r9, %rsi
+; AVX512F-NEXT:    movq %rbp, %rsi
 ; AVX512F-NEXT:    sarq $63, %rsi
-; AVX512F-NEXT:    movq %r8, %r11
-; AVX512F-NEXT:    imulq %rsi, %r11
+; AVX512F-NEXT:    movq %rsi, %rdi
+; AVX512F-NEXT:    andq %r8, %rdi
 ; AVX512F-NEXT:    movq %r8, %rax
 ; AVX512F-NEXT:    mulq %rsi
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %r11, %rdx
-; AVX512F-NEXT:    imulq %rbp, %rsi
-; AVX512F-NEXT:    addq %rdx, %rsi
-; AVX512F-NEXT:    movq %rbp, %r11
-; AVX512F-NEXT:    sarq $63, %r11
-; AVX512F-NEXT:    movq %r11, %r14
-; AVX512F-NEXT:    imulq %r9, %r14
-; AVX512F-NEXT:    movq %r11, %rax
-; AVX512F-NEXT:    mulq %rdi
+; AVX512F-NEXT:    movq %rdx, %r11
+; AVX512F-NEXT:    subq %rdi, %r11
+; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512F-NEXT:    andq %rax, %rsi
+; AVX512F-NEXT:    subq %rsi, %r11
 ; AVX512F-NEXT:    movq %rax, %rbx
-; AVX512F-NEXT:    addq %r14, %rdx
-; AVX512F-NEXT:    imulq %rdi, %r11
-; AVX512F-NEXT:    addq %rdx, %r11
-; AVX512F-NEXT:    addq %r10, %rbx
-; AVX512F-NEXT:    adcq %rsi, %r11
-; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    mulq %r8
-; AVX512F-NEXT:    movq %rdx, %r10
+; AVX512F-NEXT:    movq %rax, %r12
+; AVX512F-NEXT:    sarq $63, %rbx
+; AVX512F-NEXT:    movq %rbx, %rsi
+; AVX512F-NEXT:    andq %rbp, %rsi
+; AVX512F-NEXT:    movq %rbx, %rax
+; AVX512F-NEXT:    mulq %r9
 ; AVX512F-NEXT:    movq %rax, %r14
+; AVX512F-NEXT:    movq %rdx, %r15
+; AVX512F-NEXT:    subq %rsi, %r15
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    subq %rbx, %r15
+; AVX512F-NEXT:    addq %r10, %r14
+; AVX512F-NEXT:    adcq %r11, %r15
 ; AVX512F-NEXT:    movq %r9, %rax
 ; AVX512F-NEXT:    mulq %r8
+; AVX512F-NEXT:    movq %rdx, %r10
+; AVX512F-NEXT:    movq %rax, %r11
+; AVX512F-NEXT:    movq %rbp, %rax
+; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    movq %rdx, %r8
-; AVX512F-NEXT:    movq %rax, %r15
-; AVX512F-NEXT:    addq %r10, %r15
+; AVX512F-NEXT:    movq %rax, %rbx
+; AVX512F-NEXT:    addq %r10, %rbx
 ; AVX512F-NEXT:    adcq $0, %r8
-; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    mulq %rbp
+; AVX512F-NEXT:    movq %r9, %rax
+; AVX512F-NEXT:    mulq %r12
 ; AVX512F-NEXT:    movq %rdx, %rdi
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %r15, %r10
+; AVX512F-NEXT:    addq %rbx, %r10
 ; AVX512F-NEXT:    adcq %r8, %rdi
 ; AVX512F-NEXT:    setb %al
 ; AVX512F-NEXT:    movzbl %al, %esi
-; AVX512F-NEXT:    movq %r9, %rax
-; AVX512F-NEXT:    mulq %rbp
+; AVX512F-NEXT:    movq %rbp, %rax
+; AVX512F-NEXT:    mulq %r12
 ; AVX512F-NEXT:    addq %rdi, %rax
 ; AVX512F-NEXT:    adcq %rsi, %rdx
-; AVX512F-NEXT:    addq %rbx, %rax
-; AVX512F-NEXT:    adcq %r11, %rdx
-; AVX512F-NEXT:    movq %r10, 8(%r12)
+; AVX512F-NEXT:    addq %r14, %rax
+; AVX512F-NEXT:    adcq %r15, %rdx
+; AVX512F-NEXT:    movq %r10, 8(%r13)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
@@ -3956,8 +3987,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    korw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT:    movq %rcx, 16(%r12)
-; AVX512F-NEXT:    movq %r14, (%r12)
+; AVX512F-NEXT:    movq %rcx, 16(%r13)
+; AVX512F-NEXT:    movq %r11, (%r13)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r12
 ; AVX512F-NEXT:    popq %r13
@@ -3974,32 +4005,35 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    pushq %r13
 ; AVX512BW-NEXT:    pushq %r12
 ; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    movq %r9, %rbp
+; AVX512BW-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512BW-NEXT:    movq %rcx, %r11
 ; AVX512BW-NEXT:    movq %rdx, %r10
-; AVX512BW-NEXT:    movq %rsi, %r9
+; AVX512BW-NEXT:    movq %rsi, %rbp
+; AVX512BW-NEXT:    movq %rdi, %r9
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512BW-NEXT:    movq %rcx, %r12
-; AVX512BW-NEXT:    sarq $63, %r12
-; AVX512BW-NEXT:    movq %r15, %rbx
-; AVX512BW-NEXT:    imulq %r12, %rbx
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512BW-NEXT:    movq %rcx, %rbx
+; AVX512BW-NEXT:    sarq $63, %rbx
+; AVX512BW-NEXT:    movq %rbx, %r14
+; AVX512BW-NEXT:    andq %r15, %r14
 ; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    mulq %r12
+; AVX512BW-NEXT:    mulq %rbx
 ; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    addq %rbx, %rdx
-; AVX512BW-NEXT:    imulq %rsi, %r12
-; AVX512BW-NEXT:    addq %rdx, %r12
-; AVX512BW-NEXT:    movq %rsi, %rbx
-; AVX512BW-NEXT:    sarq $63, %rbx
-; AVX512BW-NEXT:    movq %rbx, %r13
-; AVX512BW-NEXT:    imulq %r11, %r13
-; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    movq %rdx, %r12
+; AVX512BW-NEXT:    subq %r14, %r12
+; AVX512BW-NEXT:    andq %rdi, %rbx
+; AVX512BW-NEXT:    subq %rbx, %r12
+; AVX512BW-NEXT:    movq %rdi, %r13
+; AVX512BW-NEXT:    sarq $63, %r13
+; AVX512BW-NEXT:    movq %r13, %rsi
+; AVX512BW-NEXT:    andq %r11, %rsi
+; AVX512BW-NEXT:    movq %r13, %rax
 ; AVX512BW-NEXT:    mulq %r10
 ; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    addq %r13, %rdx
-; AVX512BW-NEXT:    imulq %r10, %rbx
-; AVX512BW-NEXT:    addq %rdx, %rbx
+; AVX512BW-NEXT:    movq %rdx, %rbx
+; AVX512BW-NEXT:    subq %rsi, %rbx
+; AVX512BW-NEXT:    andq %r10, %r13
+; AVX512BW-NEXT:    subq %r13, %rbx
 ; AVX512BW-NEXT:    addq %rcx, %r14
 ; AVX512BW-NEXT:    adcq %r12, %rbx
 ; AVX512BW-NEXT:    movq %r10, %rax
@@ -4013,74 +4047,78 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    addq %r12, %r13
 ; AVX512BW-NEXT:    adcq $0, %r15
 ; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    mulq %rsi
+; AVX512BW-NEXT:    mulq %rdi
 ; AVX512BW-NEXT:    movq %rdx, %r12
 ; AVX512BW-NEXT:    movq %rax, %r10
 ; AVX512BW-NEXT:    addq %r13, %r10
 ; AVX512BW-NEXT:    adcq %r15, %r12
 ; AVX512BW-NEXT:    setb %al
-; AVX512BW-NEXT:    movzbl %al, %r15d
+; AVX512BW-NEXT:    movzbl %al, %esi
 ; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    mulq %rsi
+; AVX512BW-NEXT:    mulq %rdi
 ; AVX512BW-NEXT:    addq %r12, %rax
-; AVX512BW-NEXT:    adcq %r15, %rdx
+; AVX512BW-NEXT:    adcq %rsi, %rdx
 ; AVX512BW-NEXT:    addq %r14, %rax
 ; AVX512BW-NEXT:    adcq %rbx, %rdx
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512BW-NEXT:    movq %r10, 24(%r12)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; AVX512BW-NEXT:    movq %r10, 24(%r13)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
 ; AVX512BW-NEXT:    orq %rdx, %r10
 ; AVX512BW-NEXT:    setne %al
 ; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    movq %r9, %rsi
+; AVX512BW-NEXT:    movq %rbp, %rsi
 ; AVX512BW-NEXT:    sarq $63, %rsi
-; AVX512BW-NEXT:    movq %r8, %r11
-; AVX512BW-NEXT:    imulq %rsi, %r11
+; AVX512BW-NEXT:    movq %rsi, %rdi
+; AVX512BW-NEXT:    andq %r8, %rdi
 ; AVX512BW-NEXT:    movq %r8, %rax
 ; AVX512BW-NEXT:    mulq %rsi
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %r11, %rdx
-; AVX512BW-NEXT:    imulq %rbp, %rsi
-; AVX512BW-NEXT:    addq %rdx, %rsi
-; AVX512BW-NEXT:    movq %rbp, %r11
-; AVX512BW-NEXT:    sarq $63, %r11
-; AVX512BW-NEXT:    movq %r11, %r14
-; AVX512BW-NEXT:    imulq %r9, %r14
-; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    mulq %rdi
+; AVX512BW-NEXT:    movq %rdx, %r11
+; AVX512BW-NEXT:    subq %rdi, %r11
+; AVX512BW-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT:    andq %rax, %rsi
+; AVX512BW-NEXT:    subq %rsi, %r11
 ; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    addq %r14, %rdx
-; AVX512BW-NEXT:    imulq %rdi, %r11
-; AVX512BW-NEXT:    addq %rdx, %r11
-; AVX512BW-NEXT:    addq %r10, %rbx
-; AVX512BW-NEXT:    adcq %rsi, %r11
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    mulq %r8
-; AVX512BW-NEXT:    movq %rdx, %r10
+; AVX512BW-NEXT:    movq %rax, %r12
+; AVX512BW-NEXT:    sarq $63, %rbx
+; AVX512BW-NEXT:    movq %rbx, %rsi
+; AVX512BW-NEXT:    andq %rbp, %rsi
+; AVX512BW-NEXT:    movq %rbx, %rax
+; AVX512BW-NEXT:    mulq %r9
 ; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    movq %rdx, %r15
+; AVX512BW-NEXT:    subq %rsi, %r15
+; AVX512BW-NEXT:    andq %r9, %rbx
+; AVX512BW-NEXT:    subq %rbx, %r15
+; AVX512BW-NEXT:    addq %r10, %r14
+; AVX512BW-NEXT:    adcq %r11, %r15
 ; AVX512BW-NEXT:    movq %r9, %rax
 ; AVX512BW-NEXT:    mulq %r8
+; AVX512BW-NEXT:    movq %rdx, %r10
+; AVX512BW-NEXT:    movq %rax, %r11
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %r15
-; AVX512BW-NEXT:    addq %r10, %r15
+; AVX512BW-NEXT:    movq %rax, %rbx
+; AVX512BW-NEXT:    addq %r10, %rbx
 ; AVX512BW-NEXT:    adcq $0, %r8
-; AVX512BW-NEXT:    movq %rdi, %rax
-; AVX512BW-NEXT:    mulq %rbp
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    mulq %r12
 ; AVX512BW-NEXT:    movq %rdx, %rdi
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %r15, %r10
+; AVX512BW-NEXT:    addq %rbx, %r10
 ; AVX512BW-NEXT:    adcq %r8, %rdi
 ; AVX512BW-NEXT:    setb %al
 ; AVX512BW-NEXT:    movzbl %al, %esi
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    mulq %rbp
+; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    mulq %r12
 ; AVX512BW-NEXT:    addq %rdi, %rax
 ; AVX512BW-NEXT:    adcq %rsi, %rdx
-; AVX512BW-NEXT:    addq %rbx, %rax
-; AVX512BW-NEXT:    adcq %r11, %rdx
-; AVX512BW-NEXT:    movq %r10, 8(%r12)
+; AVX512BW-NEXT:    addq %r14, %rax
+; AVX512BW-NEXT:    adcq %r15, %rdx
+; AVX512BW-NEXT:    movq %r10, 8(%r13)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
@@ -4092,8 +4130,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    korw %k0, %k1, %k1
 ; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT:    movq %rcx, 16(%r12)
-; AVX512BW-NEXT:    movq %r14, (%r12)
+; AVX512BW-NEXT:    movq %rcx, 16(%r13)
+; AVX512BW-NEXT:    movq %r11, (%r13)
 ; AVX512BW-NEXT:    popq %rbx
 ; AVX512BW-NEXT:    popq %r12
 ; AVX512BW-NEXT:    popq %r13

diff  --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 4adc80b3b8bd6..508b0d7fe0f2b 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -215,35 +215,36 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    imull %ebp, %edi
-; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl %ebx, %esi
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %ebp
-; WIN32-NEXT:    imull %ecx, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %ebp, %edx
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    andl %eax, %edi
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %esi
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    subl %edi, %ecx
+; WIN32-NEXT:    andl %ebp, %esi
+; WIN32-NEXT:    subl %esi, %ecx
+; WIN32-NEXT:    sarl $31, %ebp
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    andl %ebx, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    subl %edi, %esi
+; WIN32-NEXT:    andl %ebx, %ebp
+; WIN32-NEXT:    subl %ebp, %esi
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    adcl %ecx, %esi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    movl %ebx, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebx
@@ -262,7 +263,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
 ; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
@@ -271,7 +272,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %ebp, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    addl $8, %esp
@@ -573,49 +574,52 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ebp, %edi
-; WIN32-NEXT:    imull %ecx, %edi
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    andl %eax, %edi
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    imull %ebx, %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    imull %esi, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    movl %esi, %edi
-; WIN32-NEXT:    imull %esi, %ebx
-; WIN32-NEXT:    addl %edx, %ebx
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    subl %edi, %esi
+; WIN32-NEXT:    andl %ebx, %ecx
+; WIN32-NEXT:    subl %ecx, %esi
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    andl %ebp, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    subl %edi, %ebx
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    andl %ebp, %ecx
+; WIN32-NEXT:    subl %ecx, %ebx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %ebx
+; WIN32-NEXT:    adcl %esi, %ebx
 ; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %esi, %ebp
-; WIN32-NEXT:    adcl $0, %ecx
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %esi, %ecx
+; WIN32-NEXT:    adcl $0, %ebp
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %ebp, %esi
+; WIN32-NEXT:    addl %ecx, %esi
+; WIN32-NEXT:    adcl %ebp, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    adcl %ecx, %edi
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
@@ -999,30 +1003,32 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %ebx, %esi
-; WIN32-NEXT:    imull %ebx, %edi
-; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    movl %ecx, %ebp
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %ebx
-; WIN32-NEXT:    imull %ecx, %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %ebx, %edx
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    addl %edx, %esi
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    andl %eax, %edi
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %esi
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    subl %edi, %ecx
+; WIN32-NEXT:    andl %ebx, %esi
+; WIN32-NEXT:    subl %esi, %ecx
+; WIN32-NEXT:    sarl $31, %ebx
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    andl %ebp, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    subl %edi, %esi
+; WIN32-NEXT:    andl %ebp, %ebx
+; WIN32-NEXT:    subl %ebx, %esi
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    adcl %ecx, %esi
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %ebp, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebx
@@ -1704,57 +1710,62 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $16, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %esi
-; WIN32-NEXT:    movl 4(%eax), %ebp
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebx, %ecx
-; WIN32-NEXT:    imull %ebp, %ecx
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl 4(%eax), %eax
+; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    andl %eax, %ecx
+; WIN32-NEXT:    movl %eax, %ebx
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    addl %ecx, %edx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    subl %ecx, %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %esi, %ebx
-; WIN32-NEXT:    addl %edx, %ebx
-; WIN32-NEXT:    movl %ebp, %ecx
-; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    andl %esi, %edi
+; WIN32-NEXT:    subl %edi, %ebp
+; WIN32-NEXT:    movl %ebx, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    imull %ecx, %edi
+; WIN32-NEXT:    movl %ecx, %ebx
+; WIN32-NEXT:    andl %eax, %ebx
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %edi, %edx
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    addl %edx, %ecx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    subl %ebx, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    andl %edx, %ecx
+; WIN32-NEXT:    subl %ecx, %edi
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %ebx, %ecx
+; WIN32-NEXT:    adcl %ebp, %edi
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ebx, %edi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl $0, %ebp
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %ecx
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %edi, %esi
-; WIN32-NEXT:    adcl %ebp, %ebx
-; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    addl %ebx, %esi
+; WIN32-NEXT:    adcl %ebp, %ecx
+; WIN32-NEXT:    setb %bl
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ebx, %eax
-; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; WIN32-NEXT:    adcl %edi, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    movzbl %bl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %edi, %edx
 ; WIN32-NEXT:    movl %esi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
@@ -1762,7 +1773,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    addl $16, %esp
@@ -1810,35 +1821,35 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $12, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %ebp
 ; WIN32-NEXT:    movl 4(%eax), %ebx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    andl %ebp, %edi
 ; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    subl %edi, %ecx
 ; WIN32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    imull %ebx, %edi
-; WIN32-NEXT:    addl %edx, %edi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %ebx
-; WIN32-NEXT:    imull %ecx, %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    addl %ebx, %edx
-; WIN32-NEXT:    imull %ecx, %esi
-; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    andl %ebx, %esi
+; WIN32-NEXT:    subl %esi, %ecx
+; WIN32-NEXT:    sarl $31, %ebx
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    subl %edi, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    andl %edx, %ebx
+; WIN32-NEXT:    subl %ebx, %esi
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %edi, %esi
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    adcl %ecx, %esi
+; WIN32-NEXT:    movl %edx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill


        

