[llvm] ac92097 - Revert "[DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))"

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 11 16:31:19 PDT 2022


Author: Craig Topper
Date: 2022-10-11T16:30:40-07:00
New Revision: ac9209751ad7f06c42b6ac80cf9c71b3c4bd238d

URL: https://github.com/llvm/llvm-project/commit/ac9209751ad7f06c42b6ac80cf9c71b3c4bd238d
DIFF: https://github.com/llvm/llvm-project/commit/ac9209751ad7f06c42b6ac80cf9c71b3c4bd238d.diff

LOG: Revert "[DAGCombiner] Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))"

This reverts commit 0148df8157f05ecf3b1064508e6f012aefb87dad.

Getting a lit test failure on AMDGPU but I can't reproduce it so far.
Reverting to investigate.
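
For reference, the identity behind the reverted fold: (sra X, BW-1) is 0 when X is
non-negative and all-ones when X is negative, so multiplying it by Y yields either
0 or -Y, which is exactly neg (and (sra X, BW-1), Y). A minimal standalone C++
sketch of that scalar identity (illustrative names only, not LLVM code; assumes
arithmetic right shift for signed values, as on the targets in these tests):

  #include <cassert>
  #include <cstdint>

  // Original pattern: mul (sra X, BW-1), Y. The shift produces 0 or -1.
  int32_t mulBySignMask(int32_t x, int32_t y) {
    int32_t mask = x >> 31; // arithmetic shift: 0 if x >= 0, -1 if x < 0
    return mask * y;
  }

  // Folded pattern: neg (and (sra X, BW-1), Y).
  int32_t negOfAnd(int32_t x, int32_t y) {
    int32_t mask = x >> 31;
    return -(mask & y);
  }

  int main() {
    for (int32_t x : {-7, -1, 0, 1, 42})
      for (int32_t y : {-5, 0, 3, 9})
        assert(mulBySignMask(x, y) == negOfAnd(x, y));
    return 0;
  }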

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/AMDGPU/mad_64_32.ll
    llvm/test/CodeGen/PowerPC/pr45448.ll
    llvm/test/CodeGen/RISCV/mul.ll
    llvm/test/CodeGen/RISCV/xaluo.ll
    llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
    llvm/test/CodeGen/X86/extmul128.ll
    llvm/test/CodeGen/X86/muloti.ll
    llvm/test/CodeGen/X86/smul_fix_sat.ll
    llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/xmulo.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 131364e330232..195238eda3b92 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3939,30 +3939,6 @@ SDValue DAGCombiner::visitMULFIX(SDNode *N) {
   return SDValue();
 }
 
-// Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
-static SDValue foldSraMulToAndNeg(SDNode *N, SDValue N0, SDValue N1,
-                                  SelectionDAG &DAG) {
-  if (N0.getOpcode() != ISD::SRA)
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // TODO: Use computeNumSignBits() == BitWidth?
-  unsigned BitWidth = VT.getScalarSizeInBits();
-  ConstantSDNode *ShiftAmt = isConstOrConstSplat(N0.getOperand(1));
-  if (!ShiftAmt || ShiftAmt->getAPIntValue() != (BitWidth - 1))
-    return SDValue();
-
-  // If optimizing for minsize, we don't want to increase the number of
-  // instructions.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return SDValue();
-
-  SDLoc dl(N);
-  SDValue And = DAG.getNode(ISD::AND, dl, VT, N0, N1);
-  return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), And);
-}
-
 SDValue DAGCombiner::visitMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4173,11 +4149,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
     }
   }
 
-  if (SDValue V = foldSraMulToAndNeg(N, N0, N1, DAG))
-    return V;
-  if (SDValue V = foldSraMulToAndNeg(N, N1, N0, DAG))
-    return V;
-
   // reassociate mul
   if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
     return RMUL;

diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index c01ec69629f30..e955014371525 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -39,24 +39,21 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
 ; AARCH:       // %bb.0: // %Entry
 ; AARCH-NEXT:    asr x9, x1, #63
 ; AARCH-NEXT:    asr x10, x3, #63
-; AARCH-NEXT:    and x11, x9, x2
-; AARCH-NEXT:    and x14, x10, x1
-; AARCH-NEXT:    umulh x12, x2, x9
-; AARCH-NEXT:    and x9, x9, x3
-; AARCH-NEXT:    umulh x13, x10, x0
-; AARCH-NEXT:    and x10, x10, x0
-; AARCH-NEXT:    sub x12, x12, x11
-; AARCH-NEXT:    neg x11, x11
-; AARCH-NEXT:    sub x13, x13, x14
-; AARCH-NEXT:    sub x9, x12, x9
-; AARCH-NEXT:    sub x12, x13, x10
-; AARCH-NEXT:    neg x10, x10
 ; AARCH-NEXT:    umulh x14, x0, x2
+; AARCH-NEXT:    mov x8, x1
+; AARCH-NEXT:    mul x11, x2, x9
+; AARCH-NEXT:    str wzr, [x4]
+; AARCH-NEXT:    umulh x12, x10, x0
+; AARCH-NEXT:    umulh x13, x2, x9
+; AARCH-NEXT:    madd x12, x10, x1, x12
+; AARCH-NEXT:    add x13, x13, x11
+; AARCH-NEXT:    mul x10, x10, x0
+; AARCH-NEXT:    madd x9, x3, x9, x13
+; AARCH-NEXT:    add x12, x12, x10
 ; AARCH-NEXT:    adds x10, x10, x11
 ; AARCH-NEXT:    mul x11, x1, x2
 ; AARCH-NEXT:    adc x9, x12, x9
 ; AARCH-NEXT:    umulh x13, x1, x2
-; AARCH-NEXT:    mov x8, x1
 ; AARCH-NEXT:    mul x12, x0, x3
 ; AARCH-NEXT:    adds x11, x11, x14
 ; AARCH-NEXT:    umulh x14, x0, x3
@@ -76,7 +73,6 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
 ; AARCH-NEXT:    eor x9, x9, x11
 ; AARCH-NEXT:    eor x10, x10, x11
 ; AARCH-NEXT:    orr x9, x10, x9
-; AARCH-NEXT:    str wzr, [x4]
 ; AARCH-NEXT:    cmp x9, #0
 ; AARCH-NEXT:    cset w9, ne
 ; AARCH-NEXT:    tbz x8, #63, .LBB1_2

diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index bac0255ff1ce5..f806149d0c395 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -159,28 +159,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; CI-NEXT:    v_ashrrev_i32_e32 v11, 31, v0
+; CI-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
 ; CI-NEXT:    v_mov_b32_e32 v8, 0
-; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v11, v1, v[7:8]
-; CI-NEXT:    v_ashrrev_i32_e32 v12, 31, v1
-; CI-NEXT:    v_and_b32_e32 v14, v11, v1
-; CI-NEXT:    v_mov_b32_e32 v1, v10
+; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
+; CI-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
+; CI-NEXT:    v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
+; CI-NEXT:    v_mov_b32_e32 v7, v10
 ; CI-NEXT:    v_mov_b32_e32 v10, v8
-; CI-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10]
-; CI-NEXT:    v_and_b32_e32 v13, v11, v12
-; CI-NEXT:    v_sub_i32_e32 v9, vcc, 0, v14
-; CI-NEXT:    v_subb_u32_e32 v10, vcc, 0, v13, vcc
-; CI-NEXT:    v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10]
-; CI-NEXT:    v_mov_b32_e32 v0, v8
-; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CI-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
-; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1]
-; CI-NEXT:    v_add_i32_e32 v8, vcc, v0, v9
-; CI-NEXT:    v_addc_u32_e32 v9, vcc, v1, v10, vcc
-; CI-NEXT:    v_mov_b32_e32 v1, v7
+; CI-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
+; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
+; CI-NEXT:    v_add_i32_e32 v9, vcc, v7, v9
+; CI-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
+; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
+; CI-NEXT:    v_add_i32_e32 v7, vcc, v9, v0
+; CI-NEXT:    v_addc_u32_e32 v9, vcc, v10, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, v8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v6, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CI-NEXT:    v_addc_u32_e32 v2, vcc, v8, v4, vcc
+; CI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -188,64 +184,60 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; SI-NEXT:    v_mul_lo_u32 v11, v6, v1
+; SI-NEXT:    v_mul_hi_u32 v12, v0, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; SI-NEXT:    v_and_b32_e32 v9, v7, v0
-; SI-NEXT:    v_and_b32_e32 v10, v6, v1
-; SI-NEXT:    v_mul_lo_u32 v13, v6, v1
-; SI-NEXT:    v_mul_hi_u32 v14, v0, v1
-; SI-NEXT:    v_and_b32_e32 v8, v7, v6
-; SI-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; SI-NEXT:    v_mul_hi_u32 v10, v6, v7
-; SI-NEXT:    v_mul_i32_i24_e32 v11, v6, v7
-; SI-NEXT:    v_mul_hi_u32 v6, v6, v1
-; SI-NEXT:    v_mul_hi_u32 v12, v0, v7
-; SI-NEXT:    v_mul_lo_u32 v7, v0, v7
-; SI-NEXT:    v_addc_u32_e32 v8, vcc, v8, v8, vcc
-; SI-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; SI-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; SI-NEXT:    v_addc_u32_e32 v12, vcc, 0, v12, vcc
-; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; SI-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, vcc
-; SI-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
+; SI-NEXT:    v_mul_hi_u32 v14, v6, v1
+; SI-NEXT:    v_mul_lo_u32 v13, v0, v7
+; SI-NEXT:    v_mul_hi_u32 v10, v0, v7
+; SI-NEXT:    v_add_i32_e32 v12, vcc, v11, v12
+; SI-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
+; SI-NEXT:    v_mul_hi_u32 v8, v6, v7
+; SI-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; SI-NEXT:    v_mul_i32_i24_e32 v9, v6, v7
+; SI-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; SI-NEXT:    v_mul_hi_i32 v6, v1, v6
+; SI-NEXT:    v_mul_hi_i32 v7, v7, v0
+; SI-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
+; SI-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; SI-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
+; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v11
 ; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
-; SI-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
-; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v9
-; SI-NEXT:    v_subb_u32_e32 v8, vcc, v10, v8, vcc
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
+; SI-NEXT:    v_add_i32_e32 v7, vcc, v9, v10
+; SI-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; SI-NEXT:    v_addc_u32_e32 v1, vcc, v7, v3, vcc
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, v6, v4, vcc
-; SI-NEXT:    v_addc_u32_e32 v3, vcc, v8, v5, vcc
+; SI-NEXT:    v_addc_u32_e32 v1, vcc, v12, v3, vcc
+; SI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
+; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
-; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
-; GFX9-NEXT:    v_and_b32_e32 v6, v14, v1
-; GFX9-NEXT:    v_mov_b32_e32 v11, 0
-; GFX9-NEXT:    v_mov_b32_e32 v10, v9
-; GFX9-NEXT:    v_and_b32_e32 v7, v14, v15
-; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11]
-; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-NEXT:    v_mov_b32_e32 v13, v11
-; GFX9-NEXT:    v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7]
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13]
-; GFX9-NEXT:    v_mov_b32_e32 v12, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11]
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v8, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
+; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v7
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
+; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v11
+; GFX9-NEXT:    v_mov_b32_e32 v11, v9
+; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
+; GFX9-NEXT:    v_mov_b32_e32 v12, v11
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
+; GFX9-NEXT:    v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
+; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, v10
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
@@ -254,30 +246,27 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    v_ashrrev_i32_e32 v16, 31, v0
-; GFX11-NEXT:    v_ashrrev_i32_e32 v17, 31, v1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v16, v1, v[7:8]
+; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, v16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v17, v[9:10]
-; GFX11-NEXT:    v_and_b32_e32 v9, v16, v17
-; GFX11-NEXT:    v_sub_co_u32 v8, vcc_lo, 0, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
-; GFX11-NEXT:    v_mov_b32_e32 v1, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
+; GFX11-NEXT:    v_mad_i64_i32 v[9:10], null, v1, v14, 0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9]
-; GFX11-NEXT:    v_add_co_u32 v12, s0, v7, v1
-; GFX11-NEXT:    v_mov_b32_e32 v7, v11
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v13, null, 0, 0, s0
+; GFX11-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
+; GFX11-NEXT:    v_add_co_u32 v7, s0, v7, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13]
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, 0, s0
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
+; GFX11-NEXT:    v_mov_b32_e32 v7, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v12
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)

diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll
index c3337c78a4770..0f8014df8adca 100644
--- a/llvm/test/CodeGen/PowerPC/pr45448.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45448.ll
@@ -25,8 +25,7 @@ define hidden void @julia_tryparse_internal_45896() #0 {
 ; CHECK-NEXT:    rldic r5, r5, 4, 32
 ; CHECK-NEXT:    crnot 4*cr5+lt, eq
 ; CHECK-NEXT:    mulhdu r3, r3, r5
-; CHECK-NEXT:    and r6, r4, r5
-; CHECK-NEXT:    sub r6, r3, r6
+; CHECK-NEXT:    maddld r6, r4, r5, r3
 ; CHECK-NEXT:    cmpld cr1, r6, r3
 ; CHECK-NEXT:    mulhdu. r3, r4, r5
 ; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_10

diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 986e799428e57..3923c4340d30e 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1480,18 +1480,18 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32IM-NEXT:    add a5, a6, a2
 ; RV32IM-NEXT:    mul a7, a1, a3
 ; RV32IM-NEXT:    add t0, a7, a5
-; RV32IM-NEXT:    and t1, a4, a0
-; RV32IM-NEXT:    sub a2, t0, t1
+; RV32IM-NEXT:    mul t1, a4, a0
+; RV32IM-NEXT:    add a2, t0, t1
 ; RV32IM-NEXT:    sltu t2, a2, t0
 ; RV32IM-NEXT:    sltu a7, t0, a7
 ; RV32IM-NEXT:    sltu a5, a5, a6
 ; RV32IM-NEXT:    mulhu a3, a1, a3
 ; RV32IM-NEXT:    add a3, a3, a5
 ; RV32IM-NEXT:    add a3, a3, a7
-; RV32IM-NEXT:    and a1, a4, a1
+; RV32IM-NEXT:    mul a1, a4, a1
 ; RV32IM-NEXT:    mulhu a0, a4, a0
-; RV32IM-NEXT:    sub a0, a0, a1
-; RV32IM-NEXT:    sub a0, a0, t1
+; RV32IM-NEXT:    add a0, a0, a1
+; RV32IM-NEXT:    add a0, a0, t1
 ; RV32IM-NEXT:    add a0, a3, a0
 ; RV32IM-NEXT:    add a1, a0, t2
 ; RV32IM-NEXT:    mv a0, a2

diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index f3391b2816495..f6963fd674d3e 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -961,10 +961,8 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
-; RV32-NEXT:    .cfi_offset s2, -12
 ; RV32-NEXT:    mulhu a5, a0, a2
 ; RV32-NEXT:    mul a6, a1, a2
 ; RV32-NEXT:    add a5, a6, a5
@@ -980,34 +978,33 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32-NEXT:    mul t0, a1, a3
 ; RV32-NEXT:    add t1, t0, a7
 ; RV32-NEXT:    srai t2, a1, 31
-; RV32-NEXT:    and t3, t2, a2
+; RV32-NEXT:    mul t3, a2, t2
 ; RV32-NEXT:    srai t4, a3, 31
-; RV32-NEXT:    and t5, t4, a0
-; RV32-NEXT:    neg t6, t5
-; RV32-NEXT:    sub s0, t6, t3
-; RV32-NEXT:    add s1, t1, s0
-; RV32-NEXT:    sltu s2, s1, t1
+; RV32-NEXT:    mul t5, t4, a0
+; RV32-NEXT:    add t6, t5, t3
+; RV32-NEXT:    add s0, t1, t6
+; RV32-NEXT:    sltu s1, s0, t1
 ; RV32-NEXT:    sltu t0, t1, t0
 ; RV32-NEXT:    sltu a6, a7, a6
 ; RV32-NEXT:    mulhu a7, a1, a3
 ; RV32-NEXT:    add a6, a7, a6
 ; RV32-NEXT:    add a6, a6, t0
 ; RV32-NEXT:    mulhu a7, a2, t2
-; RV32-NEXT:    sub a7, a7, t3
-; RV32-NEXT:    and a3, t2, a3
-; RV32-NEXT:    sub a3, a7, a3
-; RV32-NEXT:    and a1, t4, a1
+; RV32-NEXT:    add a7, a7, t3
+; RV32-NEXT:    mul a3, a3, t2
+; RV32-NEXT:    add a3, a7, a3
+; RV32-NEXT:    mul a1, t4, a1
 ; RV32-NEXT:    mulhu a7, t4, a0
-; RV32-NEXT:    sub a1, a7, a1
-; RV32-NEXT:    sub a1, a1, t5
+; RV32-NEXT:    add a1, a7, a1
+; RV32-NEXT:    add a1, a1, t5
 ; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    sltu a3, s0, t6
+; RV32-NEXT:    sltu a3, t6, t5
 ; RV32-NEXT:    add a1, a1, a3
 ; RV32-NEXT:    add a1, a6, a1
-; RV32-NEXT:    add a1, a1, s2
+; RV32-NEXT:    add a1, a1, s1
 ; RV32-NEXT:    srai a3, a5, 31
 ; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    xor a3, s1, a3
+; RV32-NEXT:    xor a3, s0, a3
 ; RV32-NEXT:    or a1, a3, a1
 ; RV32-NEXT:    snez a1, a1
 ; RV32-NEXT:    mul a0, a0, a2
@@ -1016,7 +1013,6 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -1036,10 +1032,8 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
 ; RV32ZBA-NEXT:    .cfi_offset s1, -8
-; RV32ZBA-NEXT:    .cfi_offset s2, -12
 ; RV32ZBA-NEXT:    mulhu a5, a0, a2
 ; RV32ZBA-NEXT:    mul a6, a1, a2
 ; RV32ZBA-NEXT:    add a5, a6, a5
@@ -1055,34 +1049,33 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32ZBA-NEXT:    mul t0, a1, a3
 ; RV32ZBA-NEXT:    add t1, t0, a7
 ; RV32ZBA-NEXT:    srai t2, a1, 31
-; RV32ZBA-NEXT:    and t3, t2, a2
+; RV32ZBA-NEXT:    mul t3, a2, t2
 ; RV32ZBA-NEXT:    srai t4, a3, 31
-; RV32ZBA-NEXT:    and t5, t4, a0
-; RV32ZBA-NEXT:    neg t6, t5
-; RV32ZBA-NEXT:    sub s0, t6, t3
-; RV32ZBA-NEXT:    add s1, t1, s0
-; RV32ZBA-NEXT:    sltu s2, s1, t1
+; RV32ZBA-NEXT:    mul t5, t4, a0
+; RV32ZBA-NEXT:    add t6, t5, t3
+; RV32ZBA-NEXT:    add s0, t1, t6
+; RV32ZBA-NEXT:    sltu s1, s0, t1
 ; RV32ZBA-NEXT:    sltu t0, t1, t0
 ; RV32ZBA-NEXT:    sltu a6, a7, a6
 ; RV32ZBA-NEXT:    mulhu a7, a1, a3
 ; RV32ZBA-NEXT:    add a6, a7, a6
 ; RV32ZBA-NEXT:    add a6, a6, t0
 ; RV32ZBA-NEXT:    mulhu a7, a2, t2
-; RV32ZBA-NEXT:    sub a7, a7, t3
-; RV32ZBA-NEXT:    and a3, t2, a3
-; RV32ZBA-NEXT:    sub a3, a7, a3
-; RV32ZBA-NEXT:    and a1, t4, a1
+; RV32ZBA-NEXT:    add a7, a7, t3
+; RV32ZBA-NEXT:    mul a3, a3, t2
+; RV32ZBA-NEXT:    add a3, a7, a3
+; RV32ZBA-NEXT:    mul a1, t4, a1
 ; RV32ZBA-NEXT:    mulhu a7, t4, a0
-; RV32ZBA-NEXT:    sub a1, a7, a1
-; RV32ZBA-NEXT:    sub a1, a1, t5
+; RV32ZBA-NEXT:    add a1, a7, a1
+; RV32ZBA-NEXT:    add a1, a1, t5
 ; RV32ZBA-NEXT:    add a1, a1, a3
-; RV32ZBA-NEXT:    sltu a3, s0, t6
+; RV32ZBA-NEXT:    sltu a3, t6, t5
 ; RV32ZBA-NEXT:    add a1, a1, a3
 ; RV32ZBA-NEXT:    add a1, a6, a1
-; RV32ZBA-NEXT:    add a1, a1, s2
+; RV32ZBA-NEXT:    add a1, a1, s1
 ; RV32ZBA-NEXT:    srai a3, a5, 31
 ; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    xor a3, s1, a3
+; RV32ZBA-NEXT:    xor a3, s0, a3
 ; RV32ZBA-NEXT:    or a1, a3, a1
 ; RV32ZBA-NEXT:    snez a1, a1
 ; RV32ZBA-NEXT:    mul a0, a0, a2
@@ -1091,7 +1084,6 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 ; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -1123,8 +1115,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    srai a1, a1, 31
-; RV32-NEXT:    andi a6, a1, 13
-; RV32-NEXT:    sub a6, a5, a6
+; RV32-NEXT:    mul a6, a1, a3
+; RV32-NEXT:    add a6, a5, a6
 ; RV32-NEXT:    srai a7, a4, 31
 ; RV32-NEXT:    xor t0, a6, a7
 ; RV32-NEXT:    sltu a5, a6, a5
@@ -1160,8 +1152,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    srai a1, a1, 31
-; RV32ZBA-NEXT:    andi a6, a1, 13
-; RV32ZBA-NEXT:    sub a6, a5, a6
+; RV32ZBA-NEXT:    mul a6, a1, a3
+; RV32ZBA-NEXT:    add a6, a5, a6
 ; RV32ZBA-NEXT:    srai a7, a4, 31
 ; RV32ZBA-NEXT:    xor t0, a6, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
@@ -2360,9 +2352,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    add a4, a5, a4
@@ -2378,34 +2368,33 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mul a7, a1, a3
 ; RV32-NEXT:    add t0, a7, a6
 ; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    and t2, t1, a2
+; RV32-NEXT:    mul t2, a2, t1
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    and t4, t3, a0
-; RV32-NEXT:    neg t5, t4
-; RV32-NEXT:    sub t6, t5, t2
-; RV32-NEXT:    add s0, t0, t6
-; RV32-NEXT:    sltu s1, s0, t0
+; RV32-NEXT:    mul t4, t3, a0
+; RV32-NEXT:    add t5, t4, t2
+; RV32-NEXT:    add t6, t0, t5
+; RV32-NEXT:    sltu s0, t6, t0
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a6, a2, t1
-; RV32-NEXT:    sub a6, a6, t2
-; RV32-NEXT:    and a7, t1, a3
-; RV32-NEXT:    sub a6, a6, a7
-; RV32-NEXT:    and a7, t3, a1
+; RV32-NEXT:    add a6, a6, t2
+; RV32-NEXT:    mul a7, a3, t1
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    mul a7, t3, a1
 ; RV32-NEXT:    mulhu t0, t3, a0
-; RV32-NEXT:    sub a7, t0, a7
-; RV32-NEXT:    sub a7, a7, t4
+; RV32-NEXT:    add a7, t0, a7
+; RV32-NEXT:    add a7, a7, t4
 ; RV32-NEXT:    add a6, a7, a6
-; RV32-NEXT:    sltu a7, t6, t5
+; RV32-NEXT:    sltu a7, t5, t4
 ; RV32-NEXT:    add a6, a6, a7
 ; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a5, a5, s1
+; RV32-NEXT:    add a5, a5, s0
 ; RV32-NEXT:    srai a4, a4, 31
 ; RV32-NEXT:    xor a5, a5, a4
-; RV32-NEXT:    xor a4, s0, a4
+; RV32-NEXT:    xor a4, t6, a4
 ; RV32-NEXT:    or a4, a4, a5
 ; RV32-NEXT:    bnez a4, .LBB46_2
 ; RV32-NEXT:  # %bb.1: # %entry
@@ -2413,7 +2402,6 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:  .LBB46_2: # %entry
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -2433,9 +2421,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    addi sp, sp, -16
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
-; RV32ZBA-NEXT:    .cfi_offset s1, -8
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    add a4, a5, a4
@@ -2451,34 +2437,33 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mul a7, a1, a3
 ; RV32ZBA-NEXT:    add t0, a7, a6
 ; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    and t2, t1, a2
+; RV32ZBA-NEXT:    mul t2, a2, t1
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    and t4, t3, a0
-; RV32ZBA-NEXT:    neg t5, t4
-; RV32ZBA-NEXT:    sub t6, t5, t2
-; RV32ZBA-NEXT:    add s0, t0, t6
-; RV32ZBA-NEXT:    sltu s1, s0, t0
+; RV32ZBA-NEXT:    mul t4, t3, a0
+; RV32ZBA-NEXT:    add t5, t4, t2
+; RV32ZBA-NEXT:    add t6, t0, t5
+; RV32ZBA-NEXT:    sltu s0, t6, t0
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a6, a2, t1
-; RV32ZBA-NEXT:    sub a6, a6, t2
-; RV32ZBA-NEXT:    and a7, t1, a3
-; RV32ZBA-NEXT:    sub a6, a6, a7
-; RV32ZBA-NEXT:    and a7, t3, a1
+; RV32ZBA-NEXT:    add a6, a6, t2
+; RV32ZBA-NEXT:    mul a7, a3, t1
+; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    mul a7, t3, a1
 ; RV32ZBA-NEXT:    mulhu t0, t3, a0
-; RV32ZBA-NEXT:    sub a7, t0, a7
-; RV32ZBA-NEXT:    sub a7, a7, t4
+; RV32ZBA-NEXT:    add a7, t0, a7
+; RV32ZBA-NEXT:    add a7, a7, t4
 ; RV32ZBA-NEXT:    add a6, a7, a6
-; RV32ZBA-NEXT:    sltu a7, t6, t5
+; RV32ZBA-NEXT:    sltu a7, t5, t4
 ; RV32ZBA-NEXT:    add a6, a6, a7
 ; RV32ZBA-NEXT:    add a5, a5, a6
-; RV32ZBA-NEXT:    add a5, a5, s1
+; RV32ZBA-NEXT:    add a5, a5, s0
 ; RV32ZBA-NEXT:    srai a4, a4, 31
 ; RV32ZBA-NEXT:    xor a5, a5, a4
-; RV32ZBA-NEXT:    xor a4, s0, a4
+; RV32ZBA-NEXT:    xor a4, t6, a4
 ; RV32ZBA-NEXT:    or a4, a4, a5
 ; RV32ZBA-NEXT:    bnez a4, .LBB46_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
@@ -2486,7 +2471,6 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mv a1, a3
 ; RV32ZBA-NEXT:  .LBB46_2: # %entry
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -2513,9 +2497,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    add a4, a5, a4
@@ -2531,38 +2513,36 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mul a7, a1, a3
 ; RV32-NEXT:    add t0, a7, a6
 ; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    and t2, t1, a2
+; RV32-NEXT:    mul t2, a2, t1
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    and t4, t3, a0
-; RV32-NEXT:    neg t5, t4
-; RV32-NEXT:    sub t6, t5, t2
-; RV32-NEXT:    add s0, t0, t6
-; RV32-NEXT:    sltu s1, s0, t0
+; RV32-NEXT:    mul t4, t3, a0
+; RV32-NEXT:    add t5, t4, t2
+; RV32-NEXT:    add t6, t0, t5
+; RV32-NEXT:    sltu s0, t6, t0
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a2, a2, t1
-; RV32-NEXT:    sub a2, a2, t2
-; RV32-NEXT:    and a3, t1, a3
-; RV32-NEXT:    sub a2, a2, a3
-; RV32-NEXT:    and a1, t3, a1
+; RV32-NEXT:    add a2, a2, t2
+; RV32-NEXT:    mul a3, a3, t1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    mul a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
-; RV32-NEXT:    sub a0, a0, a1
-; RV32-NEXT:    sub a0, a0, t4
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a0, t4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sltu a1, t6, t5
+; RV32-NEXT:    sltu a1, t5, t4
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a0, a0, s1
+; RV32-NEXT:    add a0, a0, s0
 ; RV32-NEXT:    srai a1, a4, 31
 ; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a1, s0, a1
+; RV32-NEXT:    xor a1, t6, a1
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -2580,9 +2560,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    addi sp, sp, -16
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
-; RV32ZBA-NEXT:    .cfi_offset s1, -8
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    add a4, a5, a4
@@ -2598,38 +2576,36 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mul a7, a1, a3
 ; RV32ZBA-NEXT:    add t0, a7, a6
 ; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    and t2, t1, a2
+; RV32ZBA-NEXT:    mul t2, a2, t1
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    and t4, t3, a0
-; RV32ZBA-NEXT:    neg t5, t4
-; RV32ZBA-NEXT:    sub t6, t5, t2
-; RV32ZBA-NEXT:    add s0, t0, t6
-; RV32ZBA-NEXT:    sltu s1, s0, t0
+; RV32ZBA-NEXT:    mul t4, t3, a0
+; RV32ZBA-NEXT:    add t5, t4, t2
+; RV32ZBA-NEXT:    add t6, t0, t5
+; RV32ZBA-NEXT:    sltu s0, t6, t0
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a2, a2, t1
-; RV32ZBA-NEXT:    sub a2, a2, t2
-; RV32ZBA-NEXT:    and a3, t1, a3
-; RV32ZBA-NEXT:    sub a2, a2, a3
-; RV32ZBA-NEXT:    and a1, t3, a1
+; RV32ZBA-NEXT:    add a2, a2, t2
+; RV32ZBA-NEXT:    mul a3, a3, t1
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    mul a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
-; RV32ZBA-NEXT:    sub a0, a0, a1
-; RV32ZBA-NEXT:    sub a0, a0, t4
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    add a0, a0, t4
 ; RV32ZBA-NEXT:    add a0, a0, a2
-; RV32ZBA-NEXT:    sltu a1, t6, t5
+; RV32ZBA-NEXT:    sltu a1, t5, t4
 ; RV32ZBA-NEXT:    add a0, a0, a1
 ; RV32ZBA-NEXT:    add a0, a5, a0
-; RV32ZBA-NEXT:    add a0, a0, s1
+; RV32ZBA-NEXT:    add a0, a0, s0
 ; RV32ZBA-NEXT:    srai a1, a4, 31
 ; RV32ZBA-NEXT:    xor a0, a0, a1
-; RV32ZBA-NEXT:    xor a1, s0, a1
+; RV32ZBA-NEXT:    xor a1, t6, a1
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    seqz a0, a0
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -3477,9 +3453,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
-; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    add a4, a5, a4
@@ -3495,34 +3469,33 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    mul a7, a1, a3
 ; RV32-NEXT:    add t0, a7, a6
 ; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    and t2, t1, a2
+; RV32-NEXT:    mul t2, a2, t1
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    and t4, t3, a0
-; RV32-NEXT:    neg t5, t4
-; RV32-NEXT:    sub t6, t5, t2
-; RV32-NEXT:    add s0, t0, t6
-; RV32-NEXT:    sltu s1, s0, t0
+; RV32-NEXT:    mul t4, t3, a0
+; RV32-NEXT:    add t5, t4, t2
+; RV32-NEXT:    add t6, t0, t5
+; RV32-NEXT:    sltu s0, t6, t0
 ; RV32-NEXT:    sltu a7, t0, a7
 ; RV32-NEXT:    sltu a5, a6, a5
 ; RV32-NEXT:    mulhu a6, a1, a3
 ; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    add a5, a5, a7
 ; RV32-NEXT:    mulhu a2, a2, t1
-; RV32-NEXT:    sub a2, a2, t2
-; RV32-NEXT:    and a3, t1, a3
-; RV32-NEXT:    sub a2, a2, a3
-; RV32-NEXT:    and a1, t3, a1
+; RV32-NEXT:    add a2, a2, t2
+; RV32-NEXT:    mul a3, a3, t1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    mul a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
-; RV32-NEXT:    sub a0, a0, a1
-; RV32-NEXT:    sub a0, a0, t4
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, a0, t4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sltu a1, t6, t5
+; RV32-NEXT:    sltu a1, t5, t4
 ; RV32-NEXT:    add a0, a0, a1
 ; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a0, a0, s1
+; RV32-NEXT:    add a0, a0, s0
 ; RV32-NEXT:    srai a1, a4, 31
 ; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a1, s0, a1
+; RV32-NEXT:    xor a1, t6, a1
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    beqz a0, .LBB61_2
 ; RV32-NEXT:  # %bb.1: # %overflow
@@ -3532,7 +3505,6 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    li a0, 1
 ; RV32-NEXT:  .LBB61_3: # %overflow
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    ret
 ;
@@ -3554,9 +3526,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    addi sp, sp, -16
 ; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
 ; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
-; RV32ZBA-NEXT:    .cfi_offset s1, -8
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    add a4, a5, a4
@@ -3572,34 +3542,33 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    mul a7, a1, a3
 ; RV32ZBA-NEXT:    add t0, a7, a6
 ; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    and t2, t1, a2
+; RV32ZBA-NEXT:    mul t2, a2, t1
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    and t4, t3, a0
-; RV32ZBA-NEXT:    neg t5, t4
-; RV32ZBA-NEXT:    sub t6, t5, t2
-; RV32ZBA-NEXT:    add s0, t0, t6
-; RV32ZBA-NEXT:    sltu s1, s0, t0
+; RV32ZBA-NEXT:    mul t4, t3, a0
+; RV32ZBA-NEXT:    add t5, t4, t2
+; RV32ZBA-NEXT:    add t6, t0, t5
+; RV32ZBA-NEXT:    sltu s0, t6, t0
 ; RV32ZBA-NEXT:    sltu a7, t0, a7
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a3
 ; RV32ZBA-NEXT:    add a5, a6, a5
 ; RV32ZBA-NEXT:    add a5, a5, a7
 ; RV32ZBA-NEXT:    mulhu a2, a2, t1
-; RV32ZBA-NEXT:    sub a2, a2, t2
-; RV32ZBA-NEXT:    and a3, t1, a3
-; RV32ZBA-NEXT:    sub a2, a2, a3
-; RV32ZBA-NEXT:    and a1, t3, a1
+; RV32ZBA-NEXT:    add a2, a2, t2
+; RV32ZBA-NEXT:    mul a3, a3, t1
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    mul a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
-; RV32ZBA-NEXT:    sub a0, a0, a1
-; RV32ZBA-NEXT:    sub a0, a0, t4
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    add a0, a0, t4
 ; RV32ZBA-NEXT:    add a0, a0, a2
-; RV32ZBA-NEXT:    sltu a1, t6, t5
+; RV32ZBA-NEXT:    sltu a1, t5, t4
 ; RV32ZBA-NEXT:    add a0, a0, a1
 ; RV32ZBA-NEXT:    add a0, a5, a0
-; RV32ZBA-NEXT:    add a0, a0, s1
+; RV32ZBA-NEXT:    add a0, a0, s0
 ; RV32ZBA-NEXT:    srai a1, a4, 31
 ; RV32ZBA-NEXT:    xor a0, a0, a1
-; RV32ZBA-NEXT:    xor a1, s0, a1
+; RV32ZBA-NEXT:    xor a1, t6, a1
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    beqz a0, .LBB61_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
@@ -3609,7 +3578,6 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    li a0, 1
 ; RV32ZBA-NEXT:  .LBB61_3: # %overflow
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    addi sp, sp, 16
 ; RV32ZBA-NEXT:    ret
 ;
@@ -3657,8 +3625,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32-NEXT:    add a6, a4, a6
 ; RV32-NEXT:    sub t1, a6, a1
 ; RV32-NEXT:    srai t2, a1, 31
-; RV32-NEXT:    andi t3, t2, -13
-; RV32-NEXT:    sub t3, a5, t3
+; RV32-NEXT:    mul t3, t2, a2
+; RV32-NEXT:    sub t3, t3, a0
 ; RV32-NEXT:    add t4, t1, t3
 ; RV32-NEXT:    sltu t5, t4, t1
 ; RV32-NEXT:    neg t6, a1
@@ -3719,8 +3687,8 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32ZBA-NEXT:    add a6, a4, a6
 ; RV32ZBA-NEXT:    sub t1, a6, a1
 ; RV32ZBA-NEXT:    srai t2, a1, 31
-; RV32ZBA-NEXT:    andi t3, t2, -13
-; RV32ZBA-NEXT:    sub t3, a5, t3
+; RV32ZBA-NEXT:    mul t3, t2, a2
+; RV32ZBA-NEXT:    sub t3, t3, a0
 ; RV32ZBA-NEXT:    add t4, t1, t3
 ; RV32ZBA-NEXT:    sltu t5, t4, t1
 ; RV32ZBA-NEXT:    neg t6, a1

diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 9cb0ec4d98fb5..217caeebe6335 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -38,23 +38,22 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_0246_ext0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov r1, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull lr, r12, r1, r0
-; CHECK-NEXT:    umull r2, r4, r3, r0
+; CHECK-NEXT:    umull r2, r5, r3, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    and.w r2, r1, r0, asr #31
-; CHECK-NEXT:    sub.w r2, r12, r2
-; CHECK-NEXT:    and.w r1, r0, r1, asr #31
-; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    and.w r2, r3, r0, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    and.w r0, r0, r3, asr #31
-; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    mla r4, r1, r2, r12
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r2, r3, r2, r5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    mla r1, r1, r0, r4
+; CHECK-NEXT:    mla r0, r3, r0, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -68,23 +67,22 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_ext0_0246:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    asrs r4, r0, #31
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull lr, r12, r0, r1
-; CHECK-NEXT:    umull r2, r4, r0, r3
+; CHECK-NEXT:    umull r2, r5, r0, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    and.w r2, r0, r1, asr #31
-; CHECK-NEXT:    sub.w r2, r12, r2
-; CHECK-NEXT:    and.w r1, r1, r0, asr #31
-; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    and.w r2, r0, r3, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    and.w r0, r3, r0, asr #31
-; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    mla r2, r0, r2, r12
+; CHECK-NEXT:    mla r1, r4, r1, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r0, r0, r2, r5
+; CHECK-NEXT:    mla r0, r4, r3, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -132,24 +130,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_1357_ext0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vrev64.32 q1, q0
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull lr, r12, r1, r0
-; CHECK-NEXT:    umull r2, r4, r3, r0
+; CHECK-NEXT:    umull r2, r5, r3, r0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    and.w r2, r1, r0, asr #31
-; CHECK-NEXT:    sub.w r2, r12, r2
-; CHECK-NEXT:    and.w r1, r0, r1, asr #31
-; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    and.w r2, r3, r0, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    and.w r0, r0, r3, asr #31
-; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    mla r4, r1, r2, r12
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r2, r3, r2, r5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    mla r1, r1, r0, r4
+; CHECK-NEXT:    mla r0, r3, r0, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -163,24 +160,23 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_ext0_1357:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vrev64.32 q1, q0
+; CHECK-NEXT:    asrs r4, r0, #31
 ; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vmov r3, s4
 ; CHECK-NEXT:    umull lr, r12, r0, r1
-; CHECK-NEXT:    umull r2, r4, r0, r3
+; CHECK-NEXT:    umull r2, r5, r0, r3
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
-; CHECK-NEXT:    and.w r2, r0, r1, asr #31
-; CHECK-NEXT:    sub.w r2, r12, r2
-; CHECK-NEXT:    and.w r1, r1, r0, asr #31
-; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    and.w r2, r0, r3, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    and.w r0, r3, r0, asr #31
-; CHECK-NEXT:    subs r0, r2, r0
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    mla r2, r0, r2, r12
+; CHECK-NEXT:    mla r1, r4, r1, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r0, r0, r2, r5
+; CHECK-NEXT:    mla r0, r4, r3, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
   %out1 = sext <2 x i32> %shuf1 to <2 x i64>
@@ -234,39 +230,36 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_0213_ext0:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    vmov.f32 s6, s3
 ; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    umull r2, r5, r3, r0
 ; CHECK-NEXT:    umull lr, r12, r1, r0
-; CHECK-NEXT:    umull r2, r4, r3, r0
 ; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT:    and.w r2, r1, r0, asr #31
-; CHECK-NEXT:    sub.w r2, r12, r2
-; CHECK-NEXT:    and.w r1, r0, r1, asr #31
-; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    and.w r2, r3, r0, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    and.w r3, r0, r3, asr #31
-; CHECK-NEXT:    subs r2, r2, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    asrs r2, r0, #31
+; CHECK-NEXT:    mla r4, r1, r2, r12
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r5, r3, r2, r5
+; CHECK-NEXT:    asrs r3, r3, #31
+; CHECK-NEXT:    mla r1, r1, r0, r4
+; CHECK-NEXT:    mla r3, r3, r0, r5
+; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    and.w r2, r1, r0, asr #31
-; CHECK-NEXT:    umull r3, r4, r1, r0
-; CHECK-NEXT:    and.w r1, r0, r1, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    sub.w r12, r2, r1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    umull r4, r1, r2, r0
+; CHECK-NEXT:    umull r3, r5, r1, r0
+; CHECK-NEXT:    mla r5, r1, r2, r5
+; CHECK-NEXT:    asrs r1, r1, #31
+; CHECK-NEXT:    mla r12, r1, r0, r5
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    umull r4, r1, r5, r0
+; CHECK-NEXT:    mla r1, r5, r2, r1
+; CHECK-NEXT:    asrs r2, r5, #31
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    and.w r3, r2, r0, asr #31
-; CHECK-NEXT:    and.w r0, r0, r2, asr #31
-; CHECK-NEXT:    subs r1, r1, r3
-; CHECK-NEXT:    subs r0, r1, r0
+; CHECK-NEXT:    mla r0, r2, r0, r1
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   %out1 = sext <4 x i32> %shuf1 to <4 x i64>
@@ -280,39 +273,36 @@ entry:
 define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) {
 ; CHECK-LABEL: sext32_ext0_0213:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vmov.f32 s4, s1
-; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    asrs r4, r0, #31
+; CHECK-NEXT:    vmov.f32 s6, s3
 ; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    vmov r1, s6
+; CHECK-NEXT:    umull r2, r5, r0, r3
 ; CHECK-NEXT:    umull lr, r12, r0, r1
-; CHECK-NEXT:    umull r2, r4, r0, r3
 ; CHECK-NEXT:    vmov q1[2], q1[0], r2, lr
-; CHECK-NEXT:    and.w r2, r0, r1, asr #31
-; CHECK-NEXT:    sub.w r2, r12, r2
-; CHECK-NEXT:    and.w r1, r1, r0, asr #31
-; CHECK-NEXT:    subs r1, r2, r1
-; CHECK-NEXT:    and.w r2, r0, r3, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    and.w r3, r3, r0, asr #31
-; CHECK-NEXT:    subs r2, r2, r3
+; CHECK-NEXT:    asrs r2, r1, #31
+; CHECK-NEXT:    mla r2, r0, r2, r12
+; CHECK-NEXT:    mla r1, r4, r1, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r2, r0, r2, r5
+; CHECK-NEXT:    mla r2, r4, r3, r2
 ; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
 ; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    umull r3, r4, r0, r1
-; CHECK-NEXT:    and.w r2, r0, r1, asr #31
-; CHECK-NEXT:    and.w r1, r1, r0, asr #31
-; CHECK-NEXT:    subs r2, r4, r2
-; CHECK-NEXT:    sub.w r12, r2, r1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    umull r4, r1, r0, r2
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r3
-; CHECK-NEXT:    and.w r3, r0, r2, asr #31
-; CHECK-NEXT:    and.w r0, r2, r0, asr #31
-; CHECK-NEXT:    subs r1, r1, r3
-; CHECK-NEXT:    subs r0, r1, r0
+; CHECK-NEXT:    umull r2, r3, r0, r1
+; CHECK-NEXT:    asrs r5, r1, #31
+; CHECK-NEXT:    mla r3, r0, r5, r3
+; CHECK-NEXT:    mla r12, r4, r1, r3
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    umull r5, r1, r0, r3
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT:    asrs r2, r3, #31
+; CHECK-NEXT:    mla r0, r0, r2, r1
+; CHECK-NEXT:    mla r0, r4, r3, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
   %out1 = sext <4 x i32> %shuf1 to <4 x i64>

diff --git a/llvm/test/CodeGen/X86/extmul128.ll b/llvm/test/CodeGen/X86/extmul128.ll
index a2d8211888618..a7f2959a23c2c 100644
--- a/llvm/test/CodeGen/X86/extmul128.ll
+++ b/llvm/test/CodeGen/X86/extmul128.ll
@@ -29,37 +29,6 @@ define i128 @i64_zext_sext_i128(i64 %a, i64 %b) {
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    mulq %rsi
 ; CHECK-NEXT:    sarq $63, %rsi
-; CHECK-NEXT:    andq %rdi, %rsi
-; CHECK-NEXT:    subq %rsi, %rdx
-; CHECK-NEXT:    retq
-  %aa = zext i64 %a to i128
-  %bb = sext i64 %b to i128
-  %cc = mul i128 %aa, %bb
-  ret i128 %cc
-}
-
-define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {
-; CHECK-LABEL: i64_sext_zext_i128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    sarq $63, %rcx
-; CHECK-NEXT:    mulq %rsi
-; CHECK-NEXT:    andq %rsi, %rcx
-; CHECK-NEXT:    subq %rcx, %rdx
-; CHECK-NEXT:    retq
-  %aa = sext i64 %a to i128
-  %bb = zext i64 %b to i128
-  %cc = mul i128 %aa, %bb
-  ret i128 %cc
-}
-
-define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize {
-; CHECK-LABEL: i64_zext_sext_i128_minsize:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq %rsi
-; CHECK-NEXT:    sarq $63, %rsi
 ; CHECK-NEXT:    imulq %rdi, %rsi
 ; CHECK-NEXT:    addq %rsi, %rdx
 ; CHECK-NEXT:    retq
@@ -69,8 +38,8 @@ define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize {
   ret i128 %cc
 }
 
-define i128 @i64_sext_zext_i128_minsize(i64 %a, i64 %b) minsize {
-; CHECK-LABEL: i64_sext_zext_i128_minsize:
+define i128 @i64_sext_zext_i128(i64 %a, i64 %b) {
+; CHECK-LABEL: i64_sext_zext_i128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    movq %rdi, %rcx

diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 3733306f354a5..9a6cf0b065662 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -7,39 +7,34 @@
 define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp {
 ; CHECK-LABEL: x:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushq %r15
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pushq %r14
-; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset %rbx, -32
-; CHECK-NEXT:    .cfi_offset %r14, -24
-; CHECK-NEXT:    .cfi_offset %r15, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    .cfi_offset %rbx, -24
+; CHECK-NEXT:    .cfi_offset %r14, -16
 ; CHECK-NEXT:    movq %rdx, %r11
 ; CHECK-NEXT:    movq %rdi, %r9
-; CHECK-NEXT:    movq %rsi, %rdi
-; CHECK-NEXT:    sarq $63, %rdi
-; CHECK-NEXT:    movq %rdi, %r10
-; CHECK-NEXT:    andq %rdx, %r10
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    sarq $63, %rbx
+; CHECK-NEXT:    movq %rdx, %rdi
+; CHECK-NEXT:    imulq %rbx, %rdi
 ; CHECK-NEXT:    movq %rdx, %rax
-; CHECK-NEXT:    mulq %rdi
+; CHECK-NEXT:    mulq %rbx
 ; CHECK-NEXT:    movq %rax, %r8
-; CHECK-NEXT:    movq %rdx, %rbx
-; CHECK-NEXT:    subq %r10, %rbx
-; CHECK-NEXT:    andq %rcx, %rdi
-; CHECK-NEXT:    subq %rdi, %rbx
-; CHECK-NEXT:    movq %rcx, %r14
-; CHECK-NEXT:    sarq $63, %r14
-; CHECK-NEXT:    movq %r14, %r15
-; CHECK-NEXT:    andq %rsi, %r15
-; CHECK-NEXT:    movq %r14, %rax
+; CHECK-NEXT:    addq %rdi, %rdx
+; CHECK-NEXT:    imulq %rcx, %rbx
+; CHECK-NEXT:    addq %rdx, %rbx
+; CHECK-NEXT:    movq %rcx, %rdi
+; CHECK-NEXT:    sarq $63, %rdi
+; CHECK-NEXT:    movq %rdi, %r14
+; CHECK-NEXT:    imulq %rsi, %r14
+; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    mulq %r9
 ; CHECK-NEXT:    movq %rax, %r10
-; CHECK-NEXT:    movq %rdx, %rdi
-; CHECK-NEXT:    subq %r15, %rdi
-; CHECK-NEXT:    andq %r9, %r14
-; CHECK-NEXT:    subq %r14, %rdi
+; CHECK-NEXT:    addq %r14, %rdx
+; CHECK-NEXT:    imulq %r9, %rdi
+; CHECK-NEXT:    addq %rdx, %rdi
 ; CHECK-NEXT:    addq %r8, %r10
 ; CHECK-NEXT:    adcq %rbx, %rdi
 ; CHECK-NEXT:    movq %r9, %rax
@@ -77,7 +72,6 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
 ; CHECK-NEXT:    movq %r9, %rdx
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
-; CHECK-NEXT:    popq %r15
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  LBB0_1: ## %overflow
 ; CHECK-NEXT:    ud2

diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 07debb11b92f7..996601ed3be64 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -369,8 +369,8 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 32
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 28
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -378,54 +378,52 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    andl %eax, %ebx
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    subl %ebx, %esi
-; X86-NEXT:    andl %ebp, %edi
-; X86-NEXT:    subl %edi, %esi
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    imull %ebp, %ebx
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    andl %ecx, %ebp
+; X86-NEXT:    imull %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    subl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %edx, %edi
-; X86-NEXT:    subl %edi, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl %bl, %esi
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    xorl %edi, %edx
@@ -436,11 +434,11 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    xorl $2147483647, %esi # imm = 0x7FFFFFFF
 ; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    notl %ecx
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovel (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel %ebp, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16

diff  --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 6631c6c4cc014..367ca660cda14 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -9,44 +9,39 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %r12
-; X64-NEXT:    .cfi_def_cfa_offset 32
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 40
-; X64-NEXT:    .cfi_offset %rbx, -40
-; X64-NEXT:    .cfi_offset %r12, -32
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    .cfi_offset %rbx, -32
 ; X64-NEXT:    .cfi_offset %r14, -24
 ; X64-NEXT:    .cfi_offset %r15, -16
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rdi, %r10
-; X64-NEXT:    movq %rsi, %r9
-; X64-NEXT:    sarq $63, %r9
-; X64-NEXT:    movq %r9, %r11
-; X64-NEXT:    andq %rdx, %r11
+; X64-NEXT:    movq %rsi, %r14
+; X64-NEXT:    sarq $63, %r14
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    imulq %r14, %rdi
 ; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    subq %r11, %r14
-; X64-NEXT:    andq %rcx, %r9
-; X64-NEXT:    subq %r9, %r14
-; X64-NEXT:    movq %rcx, %r15
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    movq %r15, %r12
-; X64-NEXT:    andq %rsi, %r12
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rdi, %rdx
+; X64-NEXT:    imulq %rcx, %r14
+; X64-NEXT:    addq %rdx, %r14
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    sarq $63, %rdi
+; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    imulq %rsi, %r15
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    subq %r12, %r9
-; X64-NEXT:    andq %r10, %r15
-; X64-NEXT:    subq %r15, %r9
-; X64-NEXT:    addq %rdi, %r11
-; X64-NEXT:    adcq %r14, %r9
+; X64-NEXT:    addq %r15, %rdx
+; X64-NEXT:    imulq %r10, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    addq %r9, %r11
+; X64-NEXT:    adcq %r14, %rdi
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rbx
@@ -66,16 +61,15 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    adcq %rbx, %rdx
 ; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    adcq %r9, %rdx
+; X64-NEXT:    adcq %rdi, %rdx
 ; X64-NEXT:    movq %r10, 8(%r8)
 ; X64-NEXT:    sarq $63, %r10
 ; X64-NEXT:    xorq %r10, %rdx
 ; X64-NEXT:    xorq %rax, %r10
 ; X64-NEXT:    orq %rdx, %r10
 ; X64-NEXT:    setne %al
-; X64-NEXT:    movq %rdi, (%r8)
+; X64-NEXT:    movq %r9, (%r8)
 ; X64-NEXT:    popq %rbx
-; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
 ; X64-NEXT:    retq
@@ -90,8 +84,8 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $60, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 80
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 76
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -105,229 +99,226 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl %edi, %esi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    imull %esi, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    subl %esi, %edi
-; X86-NEXT:    andl %ecx, %ebx
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    imull %esi, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    subl %esi, %ebp
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %ecx, %eax
-; X86-NEXT:    subl %eax, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %esi, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl (%esp), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    mull %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    subl %ecx, %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl %eax, %ebx
-; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %eax, %esi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    subl %esi, %ecx
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %edi, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    adcl %edx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    xorl %ecx, %ebp
-; X86-NEXT:    xorl %esi, %ecx
-; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -335,7 +326,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $60, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -369,239 +360,234 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movq %rcx, %r14
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rsi, %r10
-; X64-NEXT:    movq %rdi, %r11
+; X64-NEXT:    movq %rcx, %r11
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rsi, %r15
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %rcx, %rdi
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rsi, %r10
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %rdi, %rbx
-; X64-NEXT:    adcq %rsi, %r12
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %r10, %r14
+; X64-NEXT:    adcq %rcx, %r12
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %edi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %r9, %rcx
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %r12, %rsi
-; X64-NEXT:    adcq %rdi, %rdx
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movzbl %al, %ecx
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    movq %r8, %rdi
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %r12, %rbx
+; X64-NEXT:    adcq %rcx, %r11
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %r8, %rcx
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r8, %r13
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %r9, %rsi
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r12, %rdi
-; X64-NEXT:    setb %r9b
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    adcq %r12, %r10
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %r15, %r9
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rdi, %r8
-; X64-NEXT:    movzbl %r9b, %eax
+; X64-NEXT:    addq %r10, %r8
+; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
-; X64-NEXT:    adcq %rbx, %rbp
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r15, %r12
+; X64-NEXT:    adcq %r14, %rbp
+; X64-NEXT:    adcq $0, %rbx
+; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r10, %rcx
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %r9, %rsi
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    addq %rdi, %r10
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %r10, %r9
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r10, %r15
+; X64-NEXT:    addq %r9, %rax
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    adcq %r13, %r11
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r11, %r13
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    addq %r8, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %rbp, %r15
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    adcq %rax, %r10
+; X64-NEXT:    addq %r8, %r14
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %rbp, %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    addq %rsi, %r13
-; X64-NEXT:    adcq %r12, %rdi
-; X64-NEXT:    setb %r11b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    addq %rbx, %r13
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload
+; X64-NEXT:    setb %cl
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    addq %rcx, %r8
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    adcq $0, %rdi
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    addq %r8, %rax
-; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    adcq %rsi, %r10
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    adcq %rdi, %r9
+; X64-NEXT:    setb %r8b
 ; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r9
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r10, %rbx
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %r9, %r14
+; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    addq %r13, %r15
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %rdi, %r8
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movzbl %r11b, %eax
-; X64-NEXT:    adcq %rax, %rbx
+; X64-NEXT:    addq %r13, %r11
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %r10, %rsi
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    adcq %rax, %r14
 ; X64-NEXT:    adcq $0, %rbp
-; X64-NEXT:    movq %rsi, %r13
+; X64-NEXT:    movq %rbx, %r13
+; X64-NEXT:    movq %rbx, %r10
 ; X64-NEXT:    sarq $63, %r13
 ; X64-NEXT:    movq %r13, %rcx
-; X64-NEXT:    andq %r9, %rcx
+; X64-NEXT:    imulq %r12, %rcx
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rax, %r8
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    subq %rcx, %r10
-; X64-NEXT:    andq %r13, %r14
-; X64-NEXT:    subq %r14, %r10
-; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
-; X64-NEXT:    andq %r14, %rsi
+; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    imulq %r13, %r15
+; X64-NEXT:    addq %rdx, %r15
+; X64-NEXT:    movq %r13, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    imulq %rdi, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r13
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    subq %rsi, %rcx
-; X64-NEXT:    andq %r13, %rdi
-; X64-NEXT:    subq %rdi, %rcx
-; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    imulq %r13, %rsi
+; X64-NEXT:    addq %rcx, %rsi
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    addq %rax, %r8
-; X64-NEXT:    adcq %r10, %rcx
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    adcq %r15, %rsi
+; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r11, %r15
+; X64-NEXT:    addq %r9, %r15
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    addq %rsi, %r15
-; X64-NEXT:    adcq %r11, %r13
-; X64-NEXT:    setb %sil
+; X64-NEXT:    addq %rcx, %r15
+; X64-NEXT:    adcq %r9, %r13
+; X64-NEXT:    setb %cl
 ; X64-NEXT:    addq %rax, %r13
-; X64-NEXT:    movzbl %sil, %esi
-; X64-NEXT:    adcq %rdx, %rsi
+; X64-NEXT:    movzbl %cl, %r9d
+; X64-NEXT:    adcq %rdx, %r9
 ; X64-NEXT:    addq %r8, %r13
-; X64-NEXT:    adcq %rcx, %rsi
-; X64-NEXT:    sarq $63, %r9
-; X64-NEXT:    movq %r9, %r8
+; X64-NEXT:    adcq %rsi, %r9
+; X64-NEXT:    sarq $63, %r12
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    andq %rax, %r8
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    imulq %r12, %r8
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    subq %r8, %r14
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    addq %rdx, %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT:    andq %rdi, %rax
-; X64-NEXT:    subq %rax, %r14
-; X64-NEXT:    movq %r9, %r12
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    andq %rax, %r12
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    subq %r12, %r8
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    imulq %r12, %rbx
+; X64-NEXT:    addq %r8, %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT:    andq %r9, %rax
-; X64-NEXT:    subq %rax, %r8
-; X64-NEXT:    addq %rcx, %r10
-; X64-NEXT:    adcq %r14, %r8
-; X64-NEXT:    movq %rcx, %r14
-; X64-NEXT:    addq %r11, %r14
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    imulq %r12, %rcx
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    imulq %r12, %r10
+; X64-NEXT:    addq %rdx, %r10
+; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    adcq %rbx, %r10
+; X64-NEXT:    movq %rsi, %rbx
+; X64-NEXT:    addq %r11, %rbx
 ; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %rdi
-; X64-NEXT:    addq %rax, %r14
+; X64-NEXT:    addq %rax, %rbx
 ; X64-NEXT:    adcq %rdx, %r11
-; X64-NEXT:    setb %r9b
+; X64-NEXT:    setb %cl
 ; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    movzbl %r9b, %eax
+; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rdx, %rax
-; X64-NEXT:    addq %r10, %r11
-; X64-NEXT:    adcq %r8, %rax
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r15, %r14
+; X64-NEXT:    addq %r8, %r11
+; X64-NEXT:    adcq %r10, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT:    adcq %r15, %rbx
 ; X64-NEXT:    adcq %r13, %r11
-; X64-NEXT:    adcq %rsi, %rax
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
-; X64-NEXT:    adcq %rbx, %r11
+; X64-NEXT:    adcq %r9, %rax
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload
+; X64-NEXT:    adcq %r14, %r11
 ; X64-NEXT:    adcq %rbp, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
-; X64-NEXT:    movq %rsi, %rdx
-; X64-NEXT:    sarq $63, %rdx
-; X64-NEXT:    xorq %rdx, %rax
-; X64-NEXT:    xorq %rdx, %r14
-; X64-NEXT:    orq %rax, %r14
-; X64-NEXT:    xorq %rdx, %r11
-; X64-NEXT:    xorq %rcx, %rdx
-; X64-NEXT:    orq %r11, %rdx
-; X64-NEXT:    orq %r14, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    xorq %rcx, %rax
+; X64-NEXT:    xorq %rcx, %rbx
+; X64-NEXT:    orq %rax, %rbx
+; X64-NEXT:    xorq %rcx, %r11
+; X64-NEXT:    xorq %rsi, %rcx
+; X64-NEXT:    orq %r11, %rcx
+; X64-NEXT:    orq %rbx, %rcx
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq %rsi, 24(%rax)
+; X64-NEXT:    movq %rdx, 24(%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; X64-NEXT:    movq %rcx, (%rax)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -627,399 +613,400 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $152, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 172
+; X86-NEXT:    subl $156, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 176
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl %ebp, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
@@ -1032,9 +1019,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
@@ -1047,13 +1034,41 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
@@ -1062,117 +1077,89 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
@@ -1188,25 +1175,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    sarl $31, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
@@ -1214,75 +1201,76 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl (%esp), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %edx, %ecx
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl %eax, %esi
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %eax, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    subl %ebx, %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebp, %esi
@@ -1292,266 +1280,263 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    movzbl %bl, %ebx
-; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %edi, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    subl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %edi, %edx
-; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl %edi, %edx
+; X86-NEXT:    imull %edi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    subl %edx, %esi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    subl %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movzbl %bl, %ebp
-; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movzbl %bl, %ebx
+; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    adcl %ebp, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl %edx, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl %edx, %ecx
+; X86-NEXT:    imull %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl %edx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %dl
 ; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movzbl %dl, %ecx
-; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    andl %ebp, %esi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    subl %esi, %ebx
-; X86-NEXT:    andl %ebp, %ecx
-; X86-NEXT:    subl %ecx, %ebx
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    subl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    andl %ecx, %eax
-; X86-NEXT:    subl %eax, %ebp
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    xorl %edi, %esi
 ; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    xorl %edi, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    orl %eax, %ebx
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    xorl %edi, %edx
+; X86-NEXT:    xorl %edi, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    xorl %edi, %ebp
+; X86-NEXT:    orl %eax, %ebp
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 28(%eax)
+; X86-NEXT:    movl %ebx, 28(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1567,7 +1552,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, 24(%eax)
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $152, %esp
+; X86-NEXT:    addl $156, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 641663d9eedfe..dbec86755a969 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3297,33 +3297,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
 ; SSE2-NEXT:    movq %r8, %r14
-; SSE2-NEXT:    movq %rcx, %rbp
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rsi, %r11
 ; SSE2-NEXT:    movq %rdi, %r10
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movq %r11, %rbx
-; SSE2-NEXT:    sarq $63, %rbx
-; SSE2-NEXT:    movq %rbx, %r15
-; SSE2-NEXT:    andq %r14, %r15
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE2-NEXT:    movq %r11, %r12
+; SSE2-NEXT:    sarq $63, %r12
+; SSE2-NEXT:    movq %r14, %rbx
+; SSE2-NEXT:    imulq %r12, %rbx
 ; SSE2-NEXT:    movq %r14, %rax
-; SSE2-NEXT:    mulq %rbx
+; SSE2-NEXT:    mulq %r12
 ; SSE2-NEXT:    movq %rax, %rdi
-; SSE2-NEXT:    movq %rdx, %r12
-; SSE2-NEXT:    subq %r15, %r12
-; SSE2-NEXT:    andq %r9, %rbx
-; SSE2-NEXT:    subq %rbx, %r12
-; SSE2-NEXT:    movq %r9, %r13
-; SSE2-NEXT:    sarq $63, %r13
-; SSE2-NEXT:    movq %r13, %rcx
-; SSE2-NEXT:    andq %r11, %rcx
-; SSE2-NEXT:    movq %r13, %rax
+; SSE2-NEXT:    addq %rbx, %rdx
+; SSE2-NEXT:    imulq %r9, %r12
+; SSE2-NEXT:    addq %rdx, %r12
+; SSE2-NEXT:    movq %r9, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movq %rbx, %r13
+; SSE2-NEXT:    imulq %r11, %r13
+; SSE2-NEXT:    movq %rbx, %rax
 ; SSE2-NEXT:    mulq %r10
 ; SSE2-NEXT:    movq %rax, %r15
-; SSE2-NEXT:    movq %rdx, %rbx
-; SSE2-NEXT:    subq %rcx, %rbx
-; SSE2-NEXT:    andq %r10, %r13
-; SSE2-NEXT:    subq %r13, %rbx
+; SSE2-NEXT:    addq %r13, %rdx
+; SSE2-NEXT:    imulq %r10, %rbx
+; SSE2-NEXT:    addq %rdx, %rbx
 ; SSE2-NEXT:    addq %rdi, %r15
 ; SSE2-NEXT:    adcq %r12, %rbx
 ; SSE2-NEXT:    movq %r10, %rax
@@ -3343,11 +3341,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    addq %r13, %r10
 ; SSE2-NEXT:    adcq %r14, %r12
 ; SSE2-NEXT:    setb %al
-; SSE2-NEXT:    movzbl %al, %ecx
+; SSE2-NEXT:    movzbl %al, %r14d
 ; SSE2-NEXT:    movq %r11, %rax
 ; SSE2-NEXT:    mulq %r9
 ; SSE2-NEXT:    addq %r12, %rax
-; SSE2-NEXT:    adcq %rcx, %rdx
+; SSE2-NEXT:    adcq %r14, %rdx
 ; SSE2-NEXT:    addq %r15, %rax
 ; SSE2-NEXT:    adcq %rbx, %rdx
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3358,56 +3356,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    xorl %r15d, %r15d
 ; SSE2-NEXT:    orq %rdx, %r10
 ; SSE2-NEXT:    setne %r15b
-; SSE2-NEXT:    movq %rbp, %rcx
-; SSE2-NEXT:    sarq $63, %rcx
-; SSE2-NEXT:    movq %rcx, %r11
-; SSE2-NEXT:    andq %rsi, %r11
+; SSE2-NEXT:    movq %rcx, %rbx
+; SSE2-NEXT:    sarq $63, %rbx
+; SSE2-NEXT:    movq %rsi, %r10
+; SSE2-NEXT:    imulq %rbx, %r10
 ; SSE2-NEXT:    movq %rsi, %rax
-; SSE2-NEXT:    mulq %rcx
+; SSE2-NEXT:    mulq %rbx
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    movq %rdx, %r10
-; SSE2-NEXT:    subq %r11, %r10
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andq %rax, %rcx
-; SSE2-NEXT:    subq %rcx, %r10
-; SSE2-NEXT:    movq %rax, %r11
-; SSE2-NEXT:    movq %rax, %r13
-; SSE2-NEXT:    sarq $63, %r11
-; SSE2-NEXT:    movq %r11, %rcx
-; SSE2-NEXT:    andq %rbp, %rcx
-; SSE2-NEXT:    movq %r11, %rax
+; SSE2-NEXT:    addq %r10, %rdx
+; SSE2-NEXT:    imulq %rbp, %rbx
+; SSE2-NEXT:    addq %rdx, %rbx
+; SSE2-NEXT:    movq %rbp, %r10
+; SSE2-NEXT:    sarq $63, %r10
+; SSE2-NEXT:    movq %r10, %r14
+; SSE2-NEXT:    imulq %rcx, %r14
+; SSE2-NEXT:    movq %r10, %rax
 ; SSE2-NEXT:    mulq %r8
-; SSE2-NEXT:    movq %rax, %rbx
-; SSE2-NEXT:    movq %rdx, %r14
-; SSE2-NEXT:    subq %rcx, %r14
-; SSE2-NEXT:    andq %r8, %r11
-; SSE2-NEXT:    subq %r11, %r14
-; SSE2-NEXT:    addq %r9, %rbx
-; SSE2-NEXT:    adcq %r10, %r14
+; SSE2-NEXT:    movq %rax, %r11
+; SSE2-NEXT:    addq %r14, %rdx
+; SSE2-NEXT:    imulq %r8, %r10
+; SSE2-NEXT:    addq %rdx, %r10
+; SSE2-NEXT:    addq %r9, %r11
+; SSE2-NEXT:    adcq %rbx, %r10
 ; SSE2-NEXT:    movq %r8, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %r9
-; SSE2-NEXT:    movq %rax, %r10
-; SSE2-NEXT:    movq %rbp, %rax
+; SSE2-NEXT:    movq %rax, %rbx
+; SSE2-NEXT:    movq %rcx, %rax
 ; SSE2-NEXT:    mulq %rsi
 ; SSE2-NEXT:    movq %rdx, %rsi
-; SSE2-NEXT:    movq %rax, %r11
-; SSE2-NEXT:    addq %r9, %r11
+; SSE2-NEXT:    movq %rax, %r14
+; SSE2-NEXT:    addq %r9, %r14
 ; SSE2-NEXT:    adcq $0, %rsi
 ; SSE2-NEXT:    movq %r8, %rax
-; SSE2-NEXT:    mulq %r13
+; SSE2-NEXT:    mulq %rbp
 ; SSE2-NEXT:    movq %rdx, %r8
 ; SSE2-NEXT:    movq %rax, %r9
-; SSE2-NEXT:    addq %r11, %r9
+; SSE2-NEXT:    addq %r14, %r9
 ; SSE2-NEXT:    adcq %rsi, %r8
 ; SSE2-NEXT:    setb %al
-; SSE2-NEXT:    movzbl %al, %ecx
-; SSE2-NEXT:    movq %rbp, %rax
-; SSE2-NEXT:    mulq %r13
+; SSE2-NEXT:    movzbl %al, %esi
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    mulq %rbp
 ; SSE2-NEXT:    addq %r8, %rax
-; SSE2-NEXT:    adcq %rcx, %rdx
-; SSE2-NEXT:    addq %rbx, %rax
-; SSE2-NEXT:    adcq %r14, %rdx
+; SSE2-NEXT:    adcq %rsi, %rdx
+; SSE2-NEXT:    addq %r11, %rax
+; SSE2-NEXT:    adcq %r10, %rdx
 ; SSE2-NEXT:    movq %r9, 24(%r12)
 ; SSE2-NEXT:    sarq $63, %r9
 ; SSE2-NEXT:    xorq %r9, %rdx
@@ -3420,7 +3414,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE2-NEXT:    negl %r15d
 ; SSE2-NEXT:    movd %r15d, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movq %r10, 16(%r12)
+; SSE2-NEXT:    movq %rbx, 16(%r12)
 ; SSE2-NEXT:    movq %rdi, (%r12)
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
@@ -3439,33 +3433,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    pushq %r12
 ; SSSE3-NEXT:    pushq %rbx
 ; SSSE3-NEXT:    movq %r8, %r14
-; SSSE3-NEXT:    movq %rcx, %rbp
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rsi, %r11
 ; SSSE3-NEXT:    movq %rdi, %r10
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSSE3-NEXT:    movq %r11, %rbx
-; SSSE3-NEXT:    sarq $63, %rbx
-; SSSE3-NEXT:    movq %rbx, %r15
-; SSSE3-NEXT:    andq %r14, %r15
+; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSSE3-NEXT:    movq %r11, %r12
+; SSSE3-NEXT:    sarq $63, %r12
+; SSSE3-NEXT:    movq %r14, %rbx
+; SSSE3-NEXT:    imulq %r12, %rbx
 ; SSSE3-NEXT:    movq %r14, %rax
-; SSSE3-NEXT:    mulq %rbx
+; SSSE3-NEXT:    mulq %r12
 ; SSSE3-NEXT:    movq %rax, %rdi
-; SSSE3-NEXT:    movq %rdx, %r12
-; SSSE3-NEXT:    subq %r15, %r12
-; SSSE3-NEXT:    andq %r9, %rbx
-; SSSE3-NEXT:    subq %rbx, %r12
-; SSSE3-NEXT:    movq %r9, %r13
-; SSSE3-NEXT:    sarq $63, %r13
-; SSSE3-NEXT:    movq %r13, %rcx
-; SSSE3-NEXT:    andq %r11, %rcx
-; SSSE3-NEXT:    movq %r13, %rax
+; SSSE3-NEXT:    addq %rbx, %rdx
+; SSSE3-NEXT:    imulq %r9, %r12
+; SSSE3-NEXT:    addq %rdx, %r12
+; SSSE3-NEXT:    movq %r9, %rbx
+; SSSE3-NEXT:    sarq $63, %rbx
+; SSSE3-NEXT:    movq %rbx, %r13
+; SSSE3-NEXT:    imulq %r11, %r13
+; SSSE3-NEXT:    movq %rbx, %rax
 ; SSSE3-NEXT:    mulq %r10
 ; SSSE3-NEXT:    movq %rax, %r15
-; SSSE3-NEXT:    movq %rdx, %rbx
-; SSSE3-NEXT:    subq %rcx, %rbx
-; SSSE3-NEXT:    andq %r10, %r13
-; SSSE3-NEXT:    subq %r13, %rbx
+; SSSE3-NEXT:    addq %r13, %rdx
+; SSSE3-NEXT:    imulq %r10, %rbx
+; SSSE3-NEXT:    addq %rdx, %rbx
 ; SSSE3-NEXT:    addq %rdi, %r15
 ; SSSE3-NEXT:    adcq %r12, %rbx
 ; SSSE3-NEXT:    movq %r10, %rax
@@ -3485,11 +3477,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    addq %r13, %r10
 ; SSSE3-NEXT:    adcq %r14, %r12
 ; SSSE3-NEXT:    setb %al
-; SSSE3-NEXT:    movzbl %al, %ecx
+; SSSE3-NEXT:    movzbl %al, %r14d
 ; SSSE3-NEXT:    movq %r11, %rax
 ; SSSE3-NEXT:    mulq %r9
 ; SSSE3-NEXT:    addq %r12, %rax
-; SSSE3-NEXT:    adcq %rcx, %rdx
+; SSSE3-NEXT:    adcq %r14, %rdx
 ; SSSE3-NEXT:    addq %r15, %rax
 ; SSSE3-NEXT:    adcq %rbx, %rdx
 ; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3500,56 +3492,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    xorl %r15d, %r15d
 ; SSSE3-NEXT:    orq %rdx, %r10
 ; SSSE3-NEXT:    setne %r15b
-; SSSE3-NEXT:    movq %rbp, %rcx
-; SSSE3-NEXT:    sarq $63, %rcx
-; SSSE3-NEXT:    movq %rcx, %r11
-; SSSE3-NEXT:    andq %rsi, %r11
+; SSSE3-NEXT:    movq %rcx, %rbx
+; SSSE3-NEXT:    sarq $63, %rbx
+; SSSE3-NEXT:    movq %rsi, %r10
+; SSSE3-NEXT:    imulq %rbx, %r10
 ; SSSE3-NEXT:    movq %rsi, %rax
-; SSSE3-NEXT:    mulq %rcx
+; SSSE3-NEXT:    mulq %rbx
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    movq %rdx, %r10
-; SSSE3-NEXT:    subq %r11, %r10
-; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSSE3-NEXT:    andq %rax, %rcx
-; SSSE3-NEXT:    subq %rcx, %r10
-; SSSE3-NEXT:    movq %rax, %r11
-; SSSE3-NEXT:    movq %rax, %r13
-; SSSE3-NEXT:    sarq $63, %r11
-; SSSE3-NEXT:    movq %r11, %rcx
-; SSSE3-NEXT:    andq %rbp, %rcx
-; SSSE3-NEXT:    movq %r11, %rax
+; SSSE3-NEXT:    addq %r10, %rdx
+; SSSE3-NEXT:    imulq %rbp, %rbx
+; SSSE3-NEXT:    addq %rdx, %rbx
+; SSSE3-NEXT:    movq %rbp, %r10
+; SSSE3-NEXT:    sarq $63, %r10
+; SSSE3-NEXT:    movq %r10, %r14
+; SSSE3-NEXT:    imulq %rcx, %r14
+; SSSE3-NEXT:    movq %r10, %rax
 ; SSSE3-NEXT:    mulq %r8
-; SSSE3-NEXT:    movq %rax, %rbx
-; SSSE3-NEXT:    movq %rdx, %r14
-; SSSE3-NEXT:    subq %rcx, %r14
-; SSSE3-NEXT:    andq %r8, %r11
-; SSSE3-NEXT:    subq %r11, %r14
-; SSSE3-NEXT:    addq %r9, %rbx
-; SSSE3-NEXT:    adcq %r10, %r14
+; SSSE3-NEXT:    movq %rax, %r11
+; SSSE3-NEXT:    addq %r14, %rdx
+; SSSE3-NEXT:    imulq %r8, %r10
+; SSSE3-NEXT:    addq %rdx, %r10
+; SSSE3-NEXT:    addq %r9, %r11
+; SSSE3-NEXT:    adcq %rbx, %r10
 ; SSSE3-NEXT:    movq %r8, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %r9
-; SSSE3-NEXT:    movq %rax, %r10
-; SSSE3-NEXT:    movq %rbp, %rax
+; SSSE3-NEXT:    movq %rax, %rbx
+; SSSE3-NEXT:    movq %rcx, %rax
 ; SSSE3-NEXT:    mulq %rsi
 ; SSSE3-NEXT:    movq %rdx, %rsi
-; SSSE3-NEXT:    movq %rax, %r11
-; SSSE3-NEXT:    addq %r9, %r11
+; SSSE3-NEXT:    movq %rax, %r14
+; SSSE3-NEXT:    addq %r9, %r14
 ; SSSE3-NEXT:    adcq $0, %rsi
 ; SSSE3-NEXT:    movq %r8, %rax
-; SSSE3-NEXT:    mulq %r13
+; SSSE3-NEXT:    mulq %rbp
 ; SSSE3-NEXT:    movq %rdx, %r8
 ; SSSE3-NEXT:    movq %rax, %r9
-; SSSE3-NEXT:    addq %r11, %r9
+; SSSE3-NEXT:    addq %r14, %r9
 ; SSSE3-NEXT:    adcq %rsi, %r8
 ; SSSE3-NEXT:    setb %al
-; SSSE3-NEXT:    movzbl %al, %ecx
-; SSSE3-NEXT:    movq %rbp, %rax
-; SSSE3-NEXT:    mulq %r13
+; SSSE3-NEXT:    movzbl %al, %esi
+; SSSE3-NEXT:    movq %rcx, %rax
+; SSSE3-NEXT:    mulq %rbp
 ; SSSE3-NEXT:    addq %r8, %rax
-; SSSE3-NEXT:    adcq %rcx, %rdx
-; SSSE3-NEXT:    addq %rbx, %rax
-; SSSE3-NEXT:    adcq %r14, %rdx
+; SSSE3-NEXT:    adcq %rsi, %rdx
+; SSSE3-NEXT:    addq %r11, %rax
+; SSSE3-NEXT:    adcq %r10, %rdx
 ; SSSE3-NEXT:    movq %r9, 24(%r12)
 ; SSSE3-NEXT:    sarq $63, %r9
 ; SSSE3-NEXT:    xorq %r9, %rdx
@@ -3562,7 +3550,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSSE3-NEXT:    negl %r15d
 ; SSSE3-NEXT:    movd %r15d, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movq %r10, 16(%r12)
+; SSSE3-NEXT:    movq %rbx, 16(%r12)
 ; SSSE3-NEXT:    movq %rdi, (%r12)
 ; SSSE3-NEXT:    popq %rbx
 ; SSSE3-NEXT:    popq %r12
@@ -3581,33 +3569,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    pushq %r12
 ; SSE41-NEXT:    pushq %rbx
 ; SSE41-NEXT:    movq %r8, %r14
-; SSE41-NEXT:    movq %rcx, %rbp
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rsi, %r11
 ; SSE41-NEXT:    movq %rdi, %r10
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE41-NEXT:    movq %r11, %rbx
-; SSE41-NEXT:    sarq $63, %rbx
-; SSE41-NEXT:    movq %rbx, %r15
-; SSE41-NEXT:    andq %r14, %r15
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE41-NEXT:    movq %r11, %r12
+; SSE41-NEXT:    sarq $63, %r12
+; SSE41-NEXT:    movq %r14, %rbx
+; SSE41-NEXT:    imulq %r12, %rbx
 ; SSE41-NEXT:    movq %r14, %rax
-; SSE41-NEXT:    mulq %rbx
+; SSE41-NEXT:    mulq %r12
 ; SSE41-NEXT:    movq %rax, %rdi
-; SSE41-NEXT:    movq %rdx, %r12
-; SSE41-NEXT:    subq %r15, %r12
-; SSE41-NEXT:    andq %r9, %rbx
-; SSE41-NEXT:    subq %rbx, %r12
-; SSE41-NEXT:    movq %r9, %r13
-; SSE41-NEXT:    sarq $63, %r13
-; SSE41-NEXT:    movq %r13, %rcx
-; SSE41-NEXT:    andq %r11, %rcx
-; SSE41-NEXT:    movq %r13, %rax
+; SSE41-NEXT:    addq %rbx, %rdx
+; SSE41-NEXT:    imulq %r9, %r12
+; SSE41-NEXT:    addq %rdx, %r12
+; SSE41-NEXT:    movq %r9, %rbx
+; SSE41-NEXT:    sarq $63, %rbx
+; SSE41-NEXT:    movq %rbx, %r13
+; SSE41-NEXT:    imulq %r11, %r13
+; SSE41-NEXT:    movq %rbx, %rax
 ; SSE41-NEXT:    mulq %r10
 ; SSE41-NEXT:    movq %rax, %r15
-; SSE41-NEXT:    movq %rdx, %rbx
-; SSE41-NEXT:    subq %rcx, %rbx
-; SSE41-NEXT:    andq %r10, %r13
-; SSE41-NEXT:    subq %r13, %rbx
+; SSE41-NEXT:    addq %r13, %rdx
+; SSE41-NEXT:    imulq %r10, %rbx
+; SSE41-NEXT:    addq %rdx, %rbx
 ; SSE41-NEXT:    addq %rdi, %r15
 ; SSE41-NEXT:    adcq %r12, %rbx
 ; SSE41-NEXT:    movq %r10, %rax
@@ -3627,11 +3613,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    addq %r13, %r10
 ; SSE41-NEXT:    adcq %r14, %r12
 ; SSE41-NEXT:    setb %al
-; SSE41-NEXT:    movzbl %al, %ecx
+; SSE41-NEXT:    movzbl %al, %r14d
 ; SSE41-NEXT:    movq %r11, %rax
 ; SSE41-NEXT:    mulq %r9
 ; SSE41-NEXT:    addq %r12, %rax
-; SSE41-NEXT:    adcq %rcx, %rdx
+; SSE41-NEXT:    adcq %r14, %rdx
 ; SSE41-NEXT:    addq %r15, %rax
 ; SSE41-NEXT:    adcq %rbx, %rdx
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3642,56 +3628,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    xorl %r15d, %r15d
 ; SSE41-NEXT:    orq %rdx, %r10
 ; SSE41-NEXT:    setne %r15b
-; SSE41-NEXT:    movq %rbp, %rcx
-; SSE41-NEXT:    sarq $63, %rcx
-; SSE41-NEXT:    movq %rcx, %r11
-; SSE41-NEXT:    andq %rsi, %r11
+; SSE41-NEXT:    movq %rcx, %rbx
+; SSE41-NEXT:    sarq $63, %rbx
+; SSE41-NEXT:    movq %rsi, %r10
+; SSE41-NEXT:    imulq %rbx, %r10
 ; SSE41-NEXT:    movq %rsi, %rax
-; SSE41-NEXT:    mulq %rcx
+; SSE41-NEXT:    mulq %rbx
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    movq %rdx, %r10
-; SSE41-NEXT:    subq %r11, %r10
-; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT:    andq %rax, %rcx
-; SSE41-NEXT:    subq %rcx, %r10
-; SSE41-NEXT:    movq %rax, %r11
-; SSE41-NEXT:    movq %rax, %r13
-; SSE41-NEXT:    sarq $63, %r11
-; SSE41-NEXT:    movq %r11, %rcx
-; SSE41-NEXT:    andq %rbp, %rcx
-; SSE41-NEXT:    movq %r11, %rax
+; SSE41-NEXT:    addq %r10, %rdx
+; SSE41-NEXT:    imulq %rbp, %rbx
+; SSE41-NEXT:    addq %rdx, %rbx
+; SSE41-NEXT:    movq %rbp, %r10
+; SSE41-NEXT:    sarq $63, %r10
+; SSE41-NEXT:    movq %r10, %r14
+; SSE41-NEXT:    imulq %rcx, %r14
+; SSE41-NEXT:    movq %r10, %rax
 ; SSE41-NEXT:    mulq %r8
-; SSE41-NEXT:    movq %rax, %rbx
-; SSE41-NEXT:    movq %rdx, %r14
-; SSE41-NEXT:    subq %rcx, %r14
-; SSE41-NEXT:    andq %r8, %r11
-; SSE41-NEXT:    subq %r11, %r14
-; SSE41-NEXT:    addq %r9, %rbx
-; SSE41-NEXT:    adcq %r10, %r14
+; SSE41-NEXT:    movq %rax, %r11
+; SSE41-NEXT:    addq %r14, %rdx
+; SSE41-NEXT:    imulq %r8, %r10
+; SSE41-NEXT:    addq %rdx, %r10
+; SSE41-NEXT:    addq %r9, %r11
+; SSE41-NEXT:    adcq %rbx, %r10
 ; SSE41-NEXT:    movq %r8, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %r9
-; SSE41-NEXT:    movq %rax, %r10
-; SSE41-NEXT:    movq %rbp, %rax
+; SSE41-NEXT:    movq %rax, %rbx
+; SSE41-NEXT:    movq %rcx, %rax
 ; SSE41-NEXT:    mulq %rsi
 ; SSE41-NEXT:    movq %rdx, %rsi
-; SSE41-NEXT:    movq %rax, %r11
-; SSE41-NEXT:    addq %r9, %r11
+; SSE41-NEXT:    movq %rax, %r14
+; SSE41-NEXT:    addq %r9, %r14
 ; SSE41-NEXT:    adcq $0, %rsi
 ; SSE41-NEXT:    movq %r8, %rax
-; SSE41-NEXT:    mulq %r13
+; SSE41-NEXT:    mulq %rbp
 ; SSE41-NEXT:    movq %rdx, %r8
 ; SSE41-NEXT:    movq %rax, %r9
-; SSE41-NEXT:    addq %r11, %r9
+; SSE41-NEXT:    addq %r14, %r9
 ; SSE41-NEXT:    adcq %rsi, %r8
 ; SSE41-NEXT:    setb %al
-; SSE41-NEXT:    movzbl %al, %ecx
-; SSE41-NEXT:    movq %rbp, %rax
-; SSE41-NEXT:    mulq %r13
+; SSE41-NEXT:    movzbl %al, %esi
+; SSE41-NEXT:    movq %rcx, %rax
+; SSE41-NEXT:    mulq %rbp
 ; SSE41-NEXT:    addq %r8, %rax
-; SSE41-NEXT:    adcq %rcx, %rdx
-; SSE41-NEXT:    addq %rbx, %rax
-; SSE41-NEXT:    adcq %r14, %rdx
+; SSE41-NEXT:    adcq %rsi, %rdx
+; SSE41-NEXT:    addq %r11, %rax
+; SSE41-NEXT:    adcq %r10, %rdx
 ; SSE41-NEXT:    movq %r9, 24(%r12)
 ; SSE41-NEXT:    sarq $63, %r9
 ; SSE41-NEXT:    xorq %r9, %rdx
@@ -3703,7 +3685,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; SSE41-NEXT:    negl %r15d
 ; SSE41-NEXT:    movd %r15d, %xmm0
 ; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movq %r10, 16(%r12)
+; SSE41-NEXT:    movq %rbx, 16(%r12)
 ; SSE41-NEXT:    movq %rdi, (%r12)
 ; SSE41-NEXT:    popq %rbx
 ; SSE41-NEXT:    popq %r12
@@ -3722,33 +3704,31 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    pushq %r12
 ; AVX-NEXT:    pushq %rbx
 ; AVX-NEXT:    movq %r8, %r14
-; AVX-NEXT:    movq %rcx, %rbp
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rsi, %r11
 ; AVX-NEXT:    movq %rdi, %r10
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX-NEXT:    movq %r11, %rbx
-; AVX-NEXT:    sarq $63, %rbx
-; AVX-NEXT:    movq %rbx, %r15
-; AVX-NEXT:    andq %r14, %r15
+; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT:    movq %r11, %r12
+; AVX-NEXT:    sarq $63, %r12
+; AVX-NEXT:    movq %r14, %rbx
+; AVX-NEXT:    imulq %r12, %rbx
 ; AVX-NEXT:    movq %r14, %rax
-; AVX-NEXT:    mulq %rbx
+; AVX-NEXT:    mulq %r12
 ; AVX-NEXT:    movq %rax, %rdi
-; AVX-NEXT:    movq %rdx, %r12
-; AVX-NEXT:    subq %r15, %r12
-; AVX-NEXT:    andq %r9, %rbx
-; AVX-NEXT:    subq %rbx, %r12
-; AVX-NEXT:    movq %r9, %r13
-; AVX-NEXT:    sarq $63, %r13
-; AVX-NEXT:    movq %r13, %rcx
-; AVX-NEXT:    andq %r11, %rcx
-; AVX-NEXT:    movq %r13, %rax
+; AVX-NEXT:    addq %rbx, %rdx
+; AVX-NEXT:    imulq %r9, %r12
+; AVX-NEXT:    addq %rdx, %r12
+; AVX-NEXT:    movq %r9, %rbx
+; AVX-NEXT:    sarq $63, %rbx
+; AVX-NEXT:    movq %rbx, %r13
+; AVX-NEXT:    imulq %r11, %r13
+; AVX-NEXT:    movq %rbx, %rax
 ; AVX-NEXT:    mulq %r10
 ; AVX-NEXT:    movq %rax, %r15
-; AVX-NEXT:    movq %rdx, %rbx
-; AVX-NEXT:    subq %rcx, %rbx
-; AVX-NEXT:    andq %r10, %r13
-; AVX-NEXT:    subq %r13, %rbx
+; AVX-NEXT:    addq %r13, %rdx
+; AVX-NEXT:    imulq %r10, %rbx
+; AVX-NEXT:    addq %rdx, %rbx
 ; AVX-NEXT:    addq %rdi, %r15
 ; AVX-NEXT:    adcq %r12, %rbx
 ; AVX-NEXT:    movq %r10, %rax
@@ -3768,11 +3748,11 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    addq %r13, %r10
 ; AVX-NEXT:    adcq %r14, %r12
 ; AVX-NEXT:    setb %al
-; AVX-NEXT:    movzbl %al, %ecx
+; AVX-NEXT:    movzbl %al, %r14d
 ; AVX-NEXT:    movq %r11, %rax
 ; AVX-NEXT:    mulq %r9
 ; AVX-NEXT:    addq %r12, %rax
-; AVX-NEXT:    adcq %rcx, %rdx
+; AVX-NEXT:    adcq %r14, %rdx
 ; AVX-NEXT:    addq %r15, %rax
 ; AVX-NEXT:    adcq %rbx, %rdx
 ; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r12
@@ -3783,56 +3763,52 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    xorl %r15d, %r15d
 ; AVX-NEXT:    orq %rdx, %r10
 ; AVX-NEXT:    setne %r15b
-; AVX-NEXT:    movq %rbp, %rcx
-; AVX-NEXT:    sarq $63, %rcx
-; AVX-NEXT:    movq %rcx, %r11
-; AVX-NEXT:    andq %rsi, %r11
+; AVX-NEXT:    movq %rcx, %rbx
+; AVX-NEXT:    sarq $63, %rbx
+; AVX-NEXT:    movq %rsi, %r10
+; AVX-NEXT:    imulq %rbx, %r10
 ; AVX-NEXT:    movq %rsi, %rax
-; AVX-NEXT:    mulq %rcx
+; AVX-NEXT:    mulq %rbx
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    movq %rdx, %r10
-; AVX-NEXT:    subq %r11, %r10
-; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    andq %rax, %rcx
-; AVX-NEXT:    subq %rcx, %r10
-; AVX-NEXT:    movq %rax, %r11
-; AVX-NEXT:    movq %rax, %r13
-; AVX-NEXT:    sarq $63, %r11
-; AVX-NEXT:    movq %r11, %rcx
-; AVX-NEXT:    andq %rbp, %rcx
-; AVX-NEXT:    movq %r11, %rax
+; AVX-NEXT:    addq %r10, %rdx
+; AVX-NEXT:    imulq %rbp, %rbx
+; AVX-NEXT:    addq %rdx, %rbx
+; AVX-NEXT:    movq %rbp, %r10
+; AVX-NEXT:    sarq $63, %r10
+; AVX-NEXT:    movq %r10, %r14
+; AVX-NEXT:    imulq %rcx, %r14
+; AVX-NEXT:    movq %r10, %rax
 ; AVX-NEXT:    mulq %r8
-; AVX-NEXT:    movq %rax, %rbx
-; AVX-NEXT:    movq %rdx, %r14
-; AVX-NEXT:    subq %rcx, %r14
-; AVX-NEXT:    andq %r8, %r11
-; AVX-NEXT:    subq %r11, %r14
-; AVX-NEXT:    addq %r9, %rbx
-; AVX-NEXT:    adcq %r10, %r14
+; AVX-NEXT:    movq %rax, %r11
+; AVX-NEXT:    addq %r14, %rdx
+; AVX-NEXT:    imulq %r8, %r10
+; AVX-NEXT:    addq %rdx, %r10
+; AVX-NEXT:    addq %r9, %r11
+; AVX-NEXT:    adcq %rbx, %r10
 ; AVX-NEXT:    movq %r8, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %r9
-; AVX-NEXT:    movq %rax, %r10
-; AVX-NEXT:    movq %rbp, %rax
+; AVX-NEXT:    movq %rax, %rbx
+; AVX-NEXT:    movq %rcx, %rax
 ; AVX-NEXT:    mulq %rsi
 ; AVX-NEXT:    movq %rdx, %rsi
-; AVX-NEXT:    movq %rax, %r11
-; AVX-NEXT:    addq %r9, %r11
+; AVX-NEXT:    movq %rax, %r14
+; AVX-NEXT:    addq %r9, %r14
 ; AVX-NEXT:    adcq $0, %rsi
 ; AVX-NEXT:    movq %r8, %rax
-; AVX-NEXT:    mulq %r13
+; AVX-NEXT:    mulq %rbp
 ; AVX-NEXT:    movq %rdx, %r8
 ; AVX-NEXT:    movq %rax, %r9
-; AVX-NEXT:    addq %r11, %r9
+; AVX-NEXT:    addq %r14, %r9
 ; AVX-NEXT:    adcq %rsi, %r8
 ; AVX-NEXT:    setb %al
-; AVX-NEXT:    movzbl %al, %ecx
-; AVX-NEXT:    movq %rbp, %rax
-; AVX-NEXT:    mulq %r13
+; AVX-NEXT:    movzbl %al, %esi
+; AVX-NEXT:    movq %rcx, %rax
+; AVX-NEXT:    mulq %rbp
 ; AVX-NEXT:    addq %r8, %rax
-; AVX-NEXT:    adcq %rcx, %rdx
-; AVX-NEXT:    addq %rbx, %rax
-; AVX-NEXT:    adcq %r14, %rdx
+; AVX-NEXT:    adcq %rsi, %rdx
+; AVX-NEXT:    addq %r11, %rax
+; AVX-NEXT:    adcq %r10, %rdx
 ; AVX-NEXT:    movq %r9, 24(%r12)
 ; AVX-NEXT:    sarq $63, %r9
 ; AVX-NEXT:    xorq %r9, %rdx
@@ -3844,7 +3820,7 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX-NEXT:    negl %r15d
 ; AVX-NEXT:    vmovd %r15d, %xmm0
 ; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movq %r10, 16(%r12)
+; AVX-NEXT:    movq %rbx, 16(%r12)
 ; AVX-NEXT:    movq %rdi, (%r12)
 ; AVX-NEXT:    popq %rbx
 ; AVX-NEXT:    popq %r12
@@ -3862,35 +3838,32 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    pushq %r13
 ; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512F-NEXT:    movq %r9, %rbp
 ; AVX512F-NEXT:    movq %rcx, %r11
 ; AVX512F-NEXT:    movq %rdx, %r10
-; AVX512F-NEXT:    movq %rsi, %rbp
-; AVX512F-NEXT:    movq %rdi, %r9
+; AVX512F-NEXT:    movq %rsi, %r9
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    movq %rcx, %rbx
-; AVX512F-NEXT:    sarq $63, %rbx
-; AVX512F-NEXT:    movq %rbx, %r14
-; AVX512F-NEXT:    andq %r15, %r14
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT:    movq %rcx, %r12
+; AVX512F-NEXT:    sarq $63, %r12
+; AVX512F-NEXT:    movq %r15, %rbx
+; AVX512F-NEXT:    imulq %r12, %rbx
 ; AVX512F-NEXT:    movq %r15, %rax
-; AVX512F-NEXT:    mulq %rbx
+; AVX512F-NEXT:    mulq %r12
 ; AVX512F-NEXT:    movq %rax, %rcx
-; AVX512F-NEXT:    movq %rdx, %r12
-; AVX512F-NEXT:    subq %r14, %r12
-; AVX512F-NEXT:    andq %rdi, %rbx
-; AVX512F-NEXT:    subq %rbx, %r12
-; AVX512F-NEXT:    movq %rdi, %r13
-; AVX512F-NEXT:    sarq $63, %r13
-; AVX512F-NEXT:    movq %r13, %rsi
-; AVX512F-NEXT:    andq %r11, %rsi
-; AVX512F-NEXT:    movq %r13, %rax
+; AVX512F-NEXT:    addq %rbx, %rdx
+; AVX512F-NEXT:    imulq %rsi, %r12
+; AVX512F-NEXT:    addq %rdx, %r12
+; AVX512F-NEXT:    movq %rsi, %rbx
+; AVX512F-NEXT:    sarq $63, %rbx
+; AVX512F-NEXT:    movq %rbx, %r13
+; AVX512F-NEXT:    imulq %r11, %r13
+; AVX512F-NEXT:    movq %rbx, %rax
 ; AVX512F-NEXT:    mulq %r10
 ; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    movq %rdx, %rbx
-; AVX512F-NEXT:    subq %rsi, %rbx
-; AVX512F-NEXT:    andq %r10, %r13
-; AVX512F-NEXT:    subq %r13, %rbx
+; AVX512F-NEXT:    addq %r13, %rdx
+; AVX512F-NEXT:    imulq %r10, %rbx
+; AVX512F-NEXT:    addq %rdx, %rbx
 ; AVX512F-NEXT:    addq %rcx, %r14
 ; AVX512F-NEXT:    adcq %r12, %rbx
 ; AVX512F-NEXT:    movq %r10, %rax
@@ -3904,78 +3877,74 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    addq %r12, %r13
 ; AVX512F-NEXT:    adcq $0, %r15
 ; AVX512F-NEXT:    movq %r10, %rax
-; AVX512F-NEXT:    mulq %rdi
+; AVX512F-NEXT:    mulq %rsi
 ; AVX512F-NEXT:    movq %rdx, %r12
 ; AVX512F-NEXT:    movq %rax, %r10
 ; AVX512F-NEXT:    addq %r13, %r10
 ; AVX512F-NEXT:    adcq %r15, %r12
 ; AVX512F-NEXT:    setb %al
-; AVX512F-NEXT:    movzbl %al, %esi
+; AVX512F-NEXT:    movzbl %al, %r15d
 ; AVX512F-NEXT:    movq %r11, %rax
-; AVX512F-NEXT:    mulq %rdi
+; AVX512F-NEXT:    mulq %rsi
 ; AVX512F-NEXT:    addq %r12, %rax
-; AVX512F-NEXT:    adcq %rsi, %rdx
+; AVX512F-NEXT:    adcq %r15, %rdx
 ; AVX512F-NEXT:    addq %r14, %rax
 ; AVX512F-NEXT:    adcq %rbx, %rdx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX512F-NEXT:    movq %r10, 24(%r13)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512F-NEXT:    movq %r10, 24(%r12)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
 ; AVX512F-NEXT:    orq %rdx, %r10
 ; AVX512F-NEXT:    setne %al
 ; AVX512F-NEXT:    kmovw %eax, %k0
-; AVX512F-NEXT:    movq %rbp, %rsi
+; AVX512F-NEXT:    movq %r9, %rsi
 ; AVX512F-NEXT:    sarq $63, %rsi
-; AVX512F-NEXT:    movq %rsi, %rdi
-; AVX512F-NEXT:    andq %r8, %rdi
+; AVX512F-NEXT:    movq %r8, %r11
+; AVX512F-NEXT:    imulq %rsi, %r11
 ; AVX512F-NEXT:    movq %r8, %rax
 ; AVX512F-NEXT:    mulq %rsi
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    movq %rdx, %r11
-; AVX512F-NEXT:    subq %rdi, %r11
-; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT:    andq %rax, %rsi
-; AVX512F-NEXT:    subq %rsi, %r11
+; AVX512F-NEXT:    addq %r11, %rdx
+; AVX512F-NEXT:    imulq %rbp, %rsi
+; AVX512F-NEXT:    addq %rdx, %rsi
+; AVX512F-NEXT:    movq %rbp, %r11
+; AVX512F-NEXT:    sarq $63, %r11
+; AVX512F-NEXT:    movq %r11, %r14
+; AVX512F-NEXT:    imulq %r9, %r14
+; AVX512F-NEXT:    movq %r11, %rax
+; AVX512F-NEXT:    mulq %rdi
 ; AVX512F-NEXT:    movq %rax, %rbx
-; AVX512F-NEXT:    movq %rax, %r12
-; AVX512F-NEXT:    sarq $63, %rbx
-; AVX512F-NEXT:    movq %rbx, %rsi
-; AVX512F-NEXT:    andq %rbp, %rsi
-; AVX512F-NEXT:    movq %rbx, %rax
-; AVX512F-NEXT:    mulq %r9
-; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    movq %rdx, %r15
-; AVX512F-NEXT:    subq %rsi, %r15
-; AVX512F-NEXT:    andq %r9, %rbx
-; AVX512F-NEXT:    subq %rbx, %r15
-; AVX512F-NEXT:    addq %r10, %r14
-; AVX512F-NEXT:    adcq %r11, %r15
-; AVX512F-NEXT:    movq %r9, %rax
+; AVX512F-NEXT:    addq %r14, %rdx
+; AVX512F-NEXT:    imulq %rdi, %r11
+; AVX512F-NEXT:    addq %rdx, %r11
+; AVX512F-NEXT:    addq %r10, %rbx
+; AVX512F-NEXT:    adcq %rsi, %r11
+; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    movq %rdx, %r10
-; AVX512F-NEXT:    movq %rax, %r11
-; AVX512F-NEXT:    movq %rbp, %rax
+; AVX512F-NEXT:    movq %rax, %r14
+; AVX512F-NEXT:    movq %r9, %rax
 ; AVX512F-NEXT:    mulq %r8
 ; AVX512F-NEXT:    movq %rdx, %r8
-; AVX512F-NEXT:    movq %rax, %rbx
-; AVX512F-NEXT:    addq %r10, %rbx
+; AVX512F-NEXT:    movq %rax, %r15
+; AVX512F-NEXT:    addq %r10, %r15
 ; AVX512F-NEXT:    adcq $0, %r8
-; AVX512F-NEXT:    movq %r9, %rax
-; AVX512F-NEXT:    mulq %r12
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    mulq %rbp
 ; AVX512F-NEXT:    movq %rdx, %rdi
 ; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    addq %rbx, %r10
+; AVX512F-NEXT:    addq %r15, %r10
 ; AVX512F-NEXT:    adcq %r8, %rdi
 ; AVX512F-NEXT:    setb %al
 ; AVX512F-NEXT:    movzbl %al, %esi
-; AVX512F-NEXT:    movq %rbp, %rax
-; AVX512F-NEXT:    mulq %r12
+; AVX512F-NEXT:    movq %r9, %rax
+; AVX512F-NEXT:    mulq %rbp
 ; AVX512F-NEXT:    addq %rdi, %rax
 ; AVX512F-NEXT:    adcq %rsi, %rdx
-; AVX512F-NEXT:    addq %r14, %rax
-; AVX512F-NEXT:    adcq %r15, %rdx
-; AVX512F-NEXT:    movq %r10, 8(%r13)
+; AVX512F-NEXT:    addq %rbx, %rax
+; AVX512F-NEXT:    adcq %r11, %rdx
+; AVX512F-NEXT:    movq %r10, 8(%r12)
 ; AVX512F-NEXT:    sarq $63, %r10
 ; AVX512F-NEXT:    xorq %r10, %rdx
 ; AVX512F-NEXT:    xorq %rax, %r10
@@ -3987,8 +3956,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512F-NEXT:    korw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT:    movq %rcx, 16(%r13)
-; AVX512F-NEXT:    movq %r11, (%r13)
+; AVX512F-NEXT:    movq %rcx, 16(%r12)
+; AVX512F-NEXT:    movq %r14, (%r12)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r12
 ; AVX512F-NEXT:    popq %r13
@@ -4005,35 +3974,32 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    pushq %r13
 ; AVX512BW-NEXT:    pushq %r12
 ; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT:    movq %r9, %rbp
 ; AVX512BW-NEXT:    movq %rcx, %r11
 ; AVX512BW-NEXT:    movq %rdx, %r10
-; AVX512BW-NEXT:    movq %rsi, %rbp
-; AVX512BW-NEXT:    movq %rdi, %r9
+; AVX512BW-NEXT:    movq %rsi, %r9
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512BW-NEXT:    movq %rcx, %rbx
-; AVX512BW-NEXT:    sarq $63, %rbx
-; AVX512BW-NEXT:    movq %rbx, %r14
-; AVX512BW-NEXT:    andq %r15, %r14
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX512BW-NEXT:    movq %rcx, %r12
+; AVX512BW-NEXT:    sarq $63, %r12
+; AVX512BW-NEXT:    movq %r15, %rbx
+; AVX512BW-NEXT:    imulq %r12, %rbx
 ; AVX512BW-NEXT:    movq %r15, %rax
-; AVX512BW-NEXT:    mulq %rbx
+; AVX512BW-NEXT:    mulq %r12
 ; AVX512BW-NEXT:    movq %rax, %rcx
-; AVX512BW-NEXT:    movq %rdx, %r12
-; AVX512BW-NEXT:    subq %r14, %r12
-; AVX512BW-NEXT:    andq %rdi, %rbx
-; AVX512BW-NEXT:    subq %rbx, %r12
-; AVX512BW-NEXT:    movq %rdi, %r13
-; AVX512BW-NEXT:    sarq $63, %r13
-; AVX512BW-NEXT:    movq %r13, %rsi
-; AVX512BW-NEXT:    andq %r11, %rsi
-; AVX512BW-NEXT:    movq %r13, %rax
+; AVX512BW-NEXT:    addq %rbx, %rdx
+; AVX512BW-NEXT:    imulq %rsi, %r12
+; AVX512BW-NEXT:    addq %rdx, %r12
+; AVX512BW-NEXT:    movq %rsi, %rbx
+; AVX512BW-NEXT:    sarq $63, %rbx
+; AVX512BW-NEXT:    movq %rbx, %r13
+; AVX512BW-NEXT:    imulq %r11, %r13
+; AVX512BW-NEXT:    movq %rbx, %rax
 ; AVX512BW-NEXT:    mulq %r10
 ; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    movq %rdx, %rbx
-; AVX512BW-NEXT:    subq %rsi, %rbx
-; AVX512BW-NEXT:    andq %r10, %r13
-; AVX512BW-NEXT:    subq %r13, %rbx
+; AVX512BW-NEXT:    addq %r13, %rdx
+; AVX512BW-NEXT:    imulq %r10, %rbx
+; AVX512BW-NEXT:    addq %rdx, %rbx
 ; AVX512BW-NEXT:    addq %rcx, %r14
 ; AVX512BW-NEXT:    adcq %r12, %rbx
 ; AVX512BW-NEXT:    movq %r10, %rax
@@ -4047,78 +4013,74 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    addq %r12, %r13
 ; AVX512BW-NEXT:    adcq $0, %r15
 ; AVX512BW-NEXT:    movq %r10, %rax
-; AVX512BW-NEXT:    mulq %rdi
+; AVX512BW-NEXT:    mulq %rsi
 ; AVX512BW-NEXT:    movq %rdx, %r12
 ; AVX512BW-NEXT:    movq %rax, %r10
 ; AVX512BW-NEXT:    addq %r13, %r10
 ; AVX512BW-NEXT:    adcq %r15, %r12
 ; AVX512BW-NEXT:    setb %al
-; AVX512BW-NEXT:    movzbl %al, %esi
+; AVX512BW-NEXT:    movzbl %al, %r15d
 ; AVX512BW-NEXT:    movq %r11, %rax
-; AVX512BW-NEXT:    mulq %rdi
+; AVX512BW-NEXT:    mulq %rsi
 ; AVX512BW-NEXT:    addq %r12, %rax
-; AVX512BW-NEXT:    adcq %rsi, %rdx
+; AVX512BW-NEXT:    adcq %r15, %rdx
 ; AVX512BW-NEXT:    addq %r14, %rax
 ; AVX512BW-NEXT:    adcq %rbx, %rdx
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX512BW-NEXT:    movq %r10, 24(%r13)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512BW-NEXT:    movq %r10, 24(%r12)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
 ; AVX512BW-NEXT:    orq %rdx, %r10
 ; AVX512BW-NEXT:    setne %al
 ; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    movq %rbp, %rsi
+; AVX512BW-NEXT:    movq %r9, %rsi
 ; AVX512BW-NEXT:    sarq $63, %rsi
-; AVX512BW-NEXT:    movq %rsi, %rdi
-; AVX512BW-NEXT:    andq %r8, %rdi
+; AVX512BW-NEXT:    movq %r8, %r11
+; AVX512BW-NEXT:    imulq %rsi, %r11
 ; AVX512BW-NEXT:    movq %r8, %rax
 ; AVX512BW-NEXT:    mulq %rsi
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    movq %rdx, %r11
-; AVX512BW-NEXT:    subq %rdi, %r11
-; AVX512BW-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512BW-NEXT:    andq %rax, %rsi
-; AVX512BW-NEXT:    subq %rsi, %r11
+; AVX512BW-NEXT:    addq %r11, %rdx
+; AVX512BW-NEXT:    imulq %rbp, %rsi
+; AVX512BW-NEXT:    addq %rdx, %rsi
+; AVX512BW-NEXT:    movq %rbp, %r11
+; AVX512BW-NEXT:    sarq $63, %r11
+; AVX512BW-NEXT:    movq %r11, %r14
+; AVX512BW-NEXT:    imulq %r9, %r14
+; AVX512BW-NEXT:    movq %r11, %rax
+; AVX512BW-NEXT:    mulq %rdi
 ; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    movq %rax, %r12
-; AVX512BW-NEXT:    sarq $63, %rbx
-; AVX512BW-NEXT:    movq %rbx, %rsi
-; AVX512BW-NEXT:    andq %rbp, %rsi
-; AVX512BW-NEXT:    movq %rbx, %rax
-; AVX512BW-NEXT:    mulq %r9
-; AVX512BW-NEXT:    movq %rax, %r14
-; AVX512BW-NEXT:    movq %rdx, %r15
-; AVX512BW-NEXT:    subq %rsi, %r15
-; AVX512BW-NEXT:    andq %r9, %rbx
-; AVX512BW-NEXT:    subq %rbx, %r15
-; AVX512BW-NEXT:    addq %r10, %r14
-; AVX512BW-NEXT:    adcq %r11, %r15
-; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    addq %r14, %rdx
+; AVX512BW-NEXT:    imulq %rdi, %r11
+; AVX512BW-NEXT:    addq %rdx, %r11
+; AVX512BW-NEXT:    addq %r10, %rbx
+; AVX512BW-NEXT:    adcq %rsi, %r11
+; AVX512BW-NEXT:    movq %rdi, %rax
 ; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    movq %rdx, %r10
-; AVX512BW-NEXT:    movq %rax, %r11
-; AVX512BW-NEXT:    movq %rbp, %rax
+; AVX512BW-NEXT:    movq %rax, %r14
+; AVX512BW-NEXT:    movq %r9, %rax
 ; AVX512BW-NEXT:    mulq %r8
 ; AVX512BW-NEXT:    movq %rdx, %r8
-; AVX512BW-NEXT:    movq %rax, %rbx
-; AVX512BW-NEXT:    addq %r10, %rbx
+; AVX512BW-NEXT:    movq %rax, %r15
+; AVX512BW-NEXT:    addq %r10, %r15
 ; AVX512BW-NEXT:    adcq $0, %r8
-; AVX512BW-NEXT:    movq %r9, %rax
-; AVX512BW-NEXT:    mulq %r12
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    mulq %rbp
 ; AVX512BW-NEXT:    movq %rdx, %rdi
 ; AVX512BW-NEXT:    movq %rax, %r10
-; AVX512BW-NEXT:    addq %rbx, %r10
+; AVX512BW-NEXT:    addq %r15, %r10
 ; AVX512BW-NEXT:    adcq %r8, %rdi
 ; AVX512BW-NEXT:    setb %al
 ; AVX512BW-NEXT:    movzbl %al, %esi
-; AVX512BW-NEXT:    movq %rbp, %rax
-; AVX512BW-NEXT:    mulq %r12
+; AVX512BW-NEXT:    movq %r9, %rax
+; AVX512BW-NEXT:    mulq %rbp
 ; AVX512BW-NEXT:    addq %rdi, %rax
 ; AVX512BW-NEXT:    adcq %rsi, %rdx
-; AVX512BW-NEXT:    addq %r14, %rax
-; AVX512BW-NEXT:    adcq %r15, %rdx
-; AVX512BW-NEXT:    movq %r10, 8(%r13)
+; AVX512BW-NEXT:    addq %rbx, %rax
+; AVX512BW-NEXT:    adcq %r11, %rdx
+; AVX512BW-NEXT:    movq %r10, 8(%r12)
 ; AVX512BW-NEXT:    sarq $63, %r10
 ; AVX512BW-NEXT:    xorq %r10, %rdx
 ; AVX512BW-NEXT:    xorq %rax, %r10
@@ -4130,8 +4092,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; AVX512BW-NEXT:    korw %k0, %k1, %k1
 ; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT:    movq %rcx, 16(%r13)
-; AVX512BW-NEXT:    movq %r11, (%r13)
+; AVX512BW-NEXT:    movq %rcx, 16(%r12)
+; AVX512BW-NEXT:    movq %r14, (%r12)
 ; AVX512BW-NEXT:    popq %rbx
 ; AVX512BW-NEXT:    popq %r12
 ; AVX512BW-NEXT:    popq %r13

diff  --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 508b0d7fe0f2b..4adc80b3b8bd6 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -215,36 +215,35 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edi
-; WIN32-NEXT:    andl %eax, %edi
-; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %esi, %ebp
+; WIN32-NEXT:    imull %ecx, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %ebp, %edx
+; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    addl %ebx, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    subl %edi, %ecx
-; WIN32-NEXT:    andl %ebp, %esi
-; WIN32-NEXT:    subl %esi, %ecx
-; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    movl %ebp, %edi
-; WIN32-NEXT:    andl %ebx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    subl %edi, %esi
-; WIN32-NEXT:    andl %ebx, %ebp
-; WIN32-NEXT:    subl %ebp, %esi
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %esi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    adcl %edi, %esi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebx
@@ -263,7 +262,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    addl %edi, %eax
 ; WIN32-NEXT:    movzbl %cl, %ecx
 ; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    adcl %esi, %edx
 ; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
@@ -272,7 +271,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %ebp, 4(%eax)
-; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    addl $8, %esp
@@ -574,52 +573,49 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    movl %eax, %esi
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    andl %eax, %edi
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    imull %ecx, %edi
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    subl %edi, %esi
-; WIN32-NEXT:    andl %ebx, %ecx
-; WIN32-NEXT:    subl %ecx, %esi
-; WIN32-NEXT:    movl %ebx, %ecx
-; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    andl %ebp, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    subl %edi, %ebx
-; WIN32-NEXT:    movl %ebp, %edi
-; WIN32-NEXT:    andl %ebp, %ecx
-; WIN32-NEXT:    subl %ecx, %ebx
+; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    imull %ebx, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    sarl $31, %ebx
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    imull %esi, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    imull %esi, %ebx
+; WIN32-NEXT:    addl %edx, %ebx
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ebx
+; WIN32-NEXT:    adcl %ecx, %ebx
 ; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %edx, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    addl %esi, %ecx
-; WIN32-NEXT:    adcl $0, %ebp
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %esi, %ebp
+; WIN32-NEXT:    adcl $0, %ecx
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %ecx, %esi
-; WIN32-NEXT:    adcl %ebp, %edi
+; WIN32-NEXT:    addl %ebp, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    adcl %ecx, %edi
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
@@ -1003,32 +999,30 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl %ecx, %esi
-; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    addl %edx, %edi
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edi
-; WIN32-NEXT:    andl %eax, %edi
-; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    subl %edi, %ecx
-; WIN32-NEXT:    andl %ebx, %esi
-; WIN32-NEXT:    subl %esi, %ecx
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    andl %ebp, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    subl %edi, %esi
-; WIN32-NEXT:    andl %ebp, %ebx
-; WIN32-NEXT:    subl %ebx, %esi
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movl %esi, %ebx
+; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %ebx, %edx
+; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    addl %ebp, %eax
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    adcl %edi, %esi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebx
@@ -1710,62 +1704,57 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $16, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %esi
-; WIN32-NEXT:    movl 4(%eax), %eax
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %edi, %ecx
-; WIN32-NEXT:    andl %eax, %ecx
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl 4(%eax), %ebp
+; WIN32-NEXT:    sarl $31, %ebx
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    imull %ebp, %ecx
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    subl %ecx, %ebp
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    addl %ecx, %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    andl %esi, %edi
-; WIN32-NEXT:    subl %edi, %ebp
-; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    imull %esi, %ebx
+; WIN32-NEXT:    addl %edx, %ebx
+; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %ecx, %ebx
-; WIN32-NEXT:    andl %eax, %ebx
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    imull %ecx, %edi
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %edi
-; WIN32-NEXT:    subl %ebx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    andl %edx, %ecx
-; WIN32-NEXT:    subl %ecx, %edi
-; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %ebp, %edi
+; WIN32-NEXT:    adcl %ebx, %ecx
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    movl %eax, %ebx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    addl %ebx, %edi
 ; WIN32-NEXT:    adcl $0, %ebp
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl %ebx, %esi
-; WIN32-NEXT:    adcl %ebp, %ecx
-; WIN32-NEXT:    setb %bl
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    adcl %ebp, %ebx
+; WIN32-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ecx, %eax
-; WIN32-NEXT:    movzbl %bl, %ecx
-; WIN32-NEXT:    adcl %ecx, %edx
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; WIN32-NEXT:    adcl %edi, %edx
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %ecx, %edx
 ; WIN32-NEXT:    movl %esi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
@@ -1773,7 +1762,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    orl %edx, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    addl $16, %esp
@@ -1821,35 +1810,35 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $12, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl (%eax), %ebp
 ; WIN32-NEXT:    movl 4(%eax), %ebx
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl %esi, %edi
-; WIN32-NEXT:    andl %ebp, %edi
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    imull %edi, %esi
 ; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    subl %edi, %ecx
+; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    movl %ebx, %esi
 ; WIN32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    andl %ebx, %esi
-; WIN32-NEXT:    subl %esi, %ecx
-; WIN32-NEXT:    sarl $31, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    subl %edi, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    andl %edx, %ebx
-; WIN32-NEXT:    subl %ebx, %esi
+; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    movl %esi, %ebx
+; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %ebx, %edx
+; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    addl %edx, %esi
 ; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %ecx, %esi
-; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    adcl %edi, %esi
+; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %edx, %edi
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill


        

