[llvm] [TargetLowering] Improve one signature of forceExpandWideMUL. (PR #123991)
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 22 11:23:18 PST 2025
https://github.com/topperc created https://github.com/llvm/llvm-project/pull/123991
We have two forceExpandWideMUL functions. One takes the low and high halves of 2 inputs and calculates the low and high half of their product. This does not calculate the full 2x width product.
The other signature takes 2 inputs and calculates the low and high half of their full 2x width product. Previously it did this by sign/zero-extending the inputs to create the high bits and then calling the other function.
We can instead copy the algorithm from the other function and use the Signed flag to determine whether we should do SRA or SRL. This avoids the need to multiply the high parts of the inputs and add those products to the high half of the result. This improves the generated code for signed multiplication.
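For illustration only, here is a minimal self-contained C++ sketch of the halves-based expansion for 64-bit operands (the name wideMul64 and the scalar model are mine for this example; the actual patch builds the equivalent ISD::MUL/AND/SRL/SRA/SHL/ADD nodes in SelectionDAG). The only difference between the signed and unsigned cases is whether the shifts of the operands and of the partial sums use SRA or SRL:

#include <cassert>
#include <cstdint>

// Compute the low and high 64-bit halves of the 128-bit product of LHS and
// RHS using only 64-bit operations, mirroring the expansion in the patch.
static void wideMul64(uint64_t LHS, uint64_t RHS, bool Signed,
                      uint64_t &Lo, uint64_t &Hi) {
  const unsigned HalfBits = 32;
  const uint64_t Mask = 0xFFFFFFFFu;

  // ISD::SRA vs. ISD::SRL, selected by the Signed flag. Assumes arithmetic
  // right-shift semantics for signed values (guaranteed since C++20).
  auto Shift = [&](uint64_t X) -> uint64_t {
    return Signed ? (uint64_t)((int64_t)X >> HalfBits) : X >> HalfBits;
  };

  uint64_t LL = LHS & Mask, RL = RHS & Mask;

  uint64_t T = LL * RL;        // low x low partial product
  uint64_t TL = T & Mask;
  uint64_t TH = T >> HalfBits; // always a logical shift

  uint64_t LH = Shift(LHS), RH = Shift(RHS);

  uint64_t U = LH * RL + TH;   // high(LHS) x low(RHS) plus carry
  uint64_t UL = U & Mask;
  uint64_t UH = Shift(U);

  uint64_t V = LL * RH + UL;   // low(LHS) x high(RHS) plus carry
  uint64_t VH = Shift(V);

  Lo = TL + (V << HalfBits);
  Hi = LH * RH + (UH + VH);
}

int main() {
  uint64_t Lo, Hi;
  wideMul64(UINT64_MAX, 2, /*Signed=*/true, Lo, Hi);  // -1 * 2 == -2
  assert(Lo == UINT64_MAX - 1 && Hi == UINT64_MAX);
  wideMul64(UINT64_MAX, 2, /*Signed=*/false, Lo, Hi); // (2^64-1) * 2
  assert(Lo == UINT64_MAX - 1 && Hi == 1);
  return 0;
}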
This should improve the performance of #123262. I don't know yet how close we will get to gcc.
From d54e83030c3768bddf99ce16f5285737b82ee1f2 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 22 Jan 2025 10:46:24 -0800
Subject: [PATCH] [TargetLowering] Improve one signature of forceExpandWideMUL.
We have two forceExpandWideMUL functions. One takes the low and
high halves of 2 inputs and calculates the low and high half of
their product. This does not calculate the full 2x width product.
The other signature takes 2 inputs and calculates the low and high
half of their full 2x width product. Previously it did this by
sign/zero extending the inputs to create the high bits and then
calling the other function.
We can instead copy the algorithm from the other function and
use the Signed flag to determine whether we should do SRA or SRL.
This avoids the need to multiply the high parts of the inputs and
add those products to the high half of the result. This improves the generated
code for signed multiplication.
This should improve the performance of #123262. I don't know yet
how close we will get to gcc.
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 79 +-
llvm/test/CodeGen/AArch64/i128-math.ll | 178 +-
.../umulo-128-legalisation-lowering.ll | 54 +-
.../CodeGen/LoongArch/smul-with-overflow.ll | 627 +++---
llvm/test/CodeGen/RISCV/xaluo.ll | 973 ++++-----
.../SPARC/smulo-128-legalisation-lowering.ll | 342 ++--
llvm/test/CodeGen/Thumb/smul_fix.ll | 265 +--
llvm/test/CodeGen/Thumb/smul_fix_sat.ll | 461 ++---
llvm/test/CodeGen/Thumb/umul_fix.ll | 346 ++--
llvm/test/CodeGen/Thumb/umul_fix_sat.ll | 247 +--
llvm/test/CodeGen/X86/muloti.ll | 79 +-
llvm/test/CodeGen/X86/smul-with-overflow.ll | 1078 +++++-----
llvm/test/CodeGen/X86/smul_fix_sat.ll | 93 +-
.../X86/smulo-128-legalisation-lowering.ll | 1793 ++++++++---------
llvm/test/CodeGen/X86/vec_smulo.ll | 1012 ++++------
llvm/test/CodeGen/X86/xmulo.ll | 388 ++--
16 files changed, 3369 insertions(+), 4646 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5861a95c090b1d..35b5226591b17c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10952,22 +10952,73 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl,
SDValue &Hi) const {
EVT VT = LHS.getValueType();
assert(RHS.getValueType() == VT && "Mismatching operand types");
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
+ // We can fall back to a libcall with an illegal type for the MUL if we
+ // have a libcall big enough.
+ // Also, we can fall back to a division in some cases, but that's a big
+ // performance hit in the general case.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (WideVT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (WideVT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (WideVT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (WideVT == MVT::i128)
+ LC = RTLIB::MUL_I128;
- SDValue HiLHS;
- SDValue HiRHS;
- if (Signed) {
- // The high part is obtained by SRA'ing all but one of the bits of low
- // part.
- unsigned LoSize = VT.getFixedSizeInBits();
- SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl);
- HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift);
- HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift);
- } else {
- HiLHS = DAG.getConstant(0, dl, VT);
- HiRHS = DAG.getConstant(0, dl, VT);
+ if (LC != RTLIB::UNKNOWN_LIBCALL && getLibcallName(LC)) {
+ SDValue HiLHS, HiRHS;
+ if (Signed) {
+ // The high part is obtained by SRA'ing all but one of the bits of low
+ // part.
+ unsigned LoSize = VT.getFixedSizeInBits();
+ SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl);
+ HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift);
+ HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift);
+ } else {
+ HiLHS = DAG.getConstant(0, dl, VT);
+ HiRHS = DAG.getConstant(0, dl, VT);
+ }
+ forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);
+ return;
}
- EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
- forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);
+
+ // Expand the multiplication by brute force. This is a generalized-version of
+ // the code from Hacker's Delight (itself derived from Knuth's Algorithm M
+ // from section 4.3.1) combined with the Hacker's delight code
+ // for calculating mulhs.
+ unsigned Bits = VT.getSizeInBits();
+ unsigned HalfBits = Bits / 2;
+ SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
+ SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask);
+ SDValue RL = DAG.getNode(ISD::AND, dl, VT, RHS, Mask);
+
+ SDValue T = DAG.getNode(ISD::MUL, dl, VT, LL, RL);
+ SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask);
+
+ SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl);
+ // This is always an unsigned shift.
+ SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift);
+
+ unsigned ShiftOpc = Signed ? ISD::SRA : ISD::SRL;
+ SDValue LH = DAG.getNode(ShiftOpc, dl, VT, LHS, Shift);
+ SDValue RH = DAG.getNode(ShiftOpc, dl, VT, RHS, Shift);
+
+ SDValue U =
+ DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RL), TH);
+ SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask);
+ SDValue UH = DAG.getNode(ShiftOpc, dl, VT, U, Shift);
+
+ SDValue V =
+ DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LL, RH), UL);
+ SDValue VH = DAG.getNode(ShiftOpc, dl, VT, V, Shift);
+
+ Lo = DAG.getNode(ISD::ADD, dl, VT, TL,
+ DAG.getNode(ISD::SHL, dl, VT, V, Shift));
+
+ Hi = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RH),
+ DAG.getNode(ISD::ADD, dl, VT, UH, VH));
}
SDValue
diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll
index 9ae906249826d3..9e1c0c1b115ab6 100644
--- a/llvm/test/CodeGen/AArch64/i128-math.ll
+++ b/llvm/test/CodeGen/AArch64/i128-math.ll
@@ -355,40 +355,32 @@ define i128 @i128_mul(i128 %x, i128 %y) {
define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_checked_mul:
; CHECK: // %bb.0:
-; CHECK-NEXT: asr x8, x1, #63
-; CHECK-NEXT: asr x11, x3, #63
-; CHECK-NEXT: umulh x13, x0, x2
-; CHECK-NEXT: mul x9, x2, x8
-; CHECK-NEXT: umulh x10, x2, x8
-; CHECK-NEXT: umulh x12, x11, x0
-; CHECK-NEXT: mul x14, x1, x2
-; CHECK-NEXT: add x10, x10, x9
-; CHECK-NEXT: madd x8, x3, x8, x10
-; CHECK-NEXT: madd x10, x11, x1, x12
-; CHECK-NEXT: mul x11, x11, x0
-; CHECK-NEXT: umulh x12, x1, x2
-; CHECK-NEXT: mul x15, x0, x3
-; CHECK-NEXT: add x10, x10, x11
-; CHECK-NEXT: adds x9, x11, x9
-; CHECK-NEXT: umulh x16, x0, x3
-; CHECK-NEXT: adc x10, x10, x8
-; CHECK-NEXT: adds x8, x14, x13
-; CHECK-NEXT: cinc x12, x12, hs
-; CHECK-NEXT: mul x11, x1, x3
-; CHECK-NEXT: adds x8, x15, x8
-; CHECK-NEXT: umulh x13, x1, x3
+; CHECK-NEXT: asr x9, x1, #63
+; CHECK-NEXT: umulh x10, x0, x2
+; CHECK-NEXT: asr x13, x3, #63
+; CHECK-NEXT: mul x11, x1, x2
+; CHECK-NEXT: umulh x8, x1, x2
+; CHECK-NEXT: mul x9, x9, x2
+; CHECK-NEXT: adds x10, x11, x10
+; CHECK-NEXT: mul x14, x0, x3
+; CHECK-NEXT: umulh x12, x0, x3
+; CHECK-NEXT: adc x9, x8, x9
+; CHECK-NEXT: mul x13, x0, x13
+; CHECK-NEXT: adds x8, x14, x10
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: smulh x10, x1, x3
; CHECK-NEXT: mov x1, x8
-; CHECK-NEXT: cinc x14, x16, hs
-; CHECK-NEXT: adds x12, x12, x14
+; CHECK-NEXT: adc x11, x12, x13
+; CHECK-NEXT: asr x12, x9, #63
+; CHECK-NEXT: asr x13, x11, #63
+; CHECK-NEXT: adds x9, x9, x11
+; CHECK-NEXT: asr x11, x8, #63
; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: cset w14, hs
-; CHECK-NEXT: adds x11, x11, x12
-; CHECK-NEXT: asr x12, x8, #63
-; CHECK-NEXT: adc x13, x13, x14
-; CHECK-NEXT: adds x9, x11, x9
-; CHECK-NEXT: adc x10, x13, x10
-; CHECK-NEXT: cmp x9, x12
-; CHECK-NEXT: ccmp x10, x12, #0, eq
+; CHECK-NEXT: adc x12, x12, x13
+; CHECK-NEXT: adds x9, x15, x9
+; CHECK-NEXT: adc x10, x10, x12
+; CHECK-NEXT: cmp x9, x11
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w2, eq
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
@@ -404,40 +396,32 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_overflowing_mul:
; CHECK: // %bb.0:
-; CHECK-NEXT: asr x8, x1, #63
-; CHECK-NEXT: asr x11, x3, #63
-; CHECK-NEXT: umulh x13, x0, x2
-; CHECK-NEXT: mul x9, x2, x8
-; CHECK-NEXT: umulh x10, x2, x8
-; CHECK-NEXT: umulh x12, x11, x0
-; CHECK-NEXT: mul x14, x1, x2
-; CHECK-NEXT: add x10, x10, x9
-; CHECK-NEXT: madd x8, x3, x8, x10
-; CHECK-NEXT: madd x10, x11, x1, x12
-; CHECK-NEXT: mul x11, x11, x0
-; CHECK-NEXT: umulh x12, x1, x2
-; CHECK-NEXT: mul x15, x0, x3
-; CHECK-NEXT: add x10, x10, x11
-; CHECK-NEXT: adds x9, x11, x9
-; CHECK-NEXT: umulh x16, x0, x3
-; CHECK-NEXT: adc x10, x10, x8
-; CHECK-NEXT: adds x8, x14, x13
-; CHECK-NEXT: cinc x12, x12, hs
-; CHECK-NEXT: mul x11, x1, x3
-; CHECK-NEXT: adds x8, x15, x8
-; CHECK-NEXT: umulh x13, x1, x3
+; CHECK-NEXT: asr x9, x1, #63
+; CHECK-NEXT: umulh x10, x0, x2
+; CHECK-NEXT: asr x13, x3, #63
+; CHECK-NEXT: mul x11, x1, x2
+; CHECK-NEXT: umulh x8, x1, x2
+; CHECK-NEXT: mul x9, x9, x2
+; CHECK-NEXT: adds x10, x11, x10
+; CHECK-NEXT: mul x14, x0, x3
+; CHECK-NEXT: umulh x12, x0, x3
+; CHECK-NEXT: adc x9, x8, x9
+; CHECK-NEXT: mul x13, x0, x13
+; CHECK-NEXT: adds x8, x14, x10
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: smulh x10, x1, x3
; CHECK-NEXT: mov x1, x8
-; CHECK-NEXT: cinc x14, x16, hs
-; CHECK-NEXT: adds x12, x12, x14
+; CHECK-NEXT: adc x11, x12, x13
+; CHECK-NEXT: asr x12, x9, #63
+; CHECK-NEXT: asr x13, x11, #63
+; CHECK-NEXT: adds x9, x9, x11
+; CHECK-NEXT: asr x11, x8, #63
; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: cset w14, hs
-; CHECK-NEXT: adds x11, x11, x12
-; CHECK-NEXT: asr x12, x8, #63
-; CHECK-NEXT: adc x13, x13, x14
-; CHECK-NEXT: adds x9, x11, x9
-; CHECK-NEXT: adc x10, x13, x10
-; CHECK-NEXT: cmp x9, x12
-; CHECK-NEXT: ccmp x10, x12, #0, eq
+; CHECK-NEXT: adc x12, x12, x13
+; CHECK-NEXT: adds x9, x15, x9
+; CHECK-NEXT: adc x10, x10, x12
+; CHECK-NEXT: cmp x9, x11
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w2, ne
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
@@ -452,46 +436,38 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
define i128 @i128_saturating_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_saturating_mul:
; CHECK: // %bb.0:
-; CHECK-NEXT: asr x8, x1, #63
-; CHECK-NEXT: asr x11, x3, #63
-; CHECK-NEXT: umulh x13, x0, x2
-; CHECK-NEXT: mul x9, x2, x8
-; CHECK-NEXT: umulh x10, x2, x8
-; CHECK-NEXT: umulh x12, x11, x0
-; CHECK-NEXT: mul x14, x1, x2
-; CHECK-NEXT: add x10, x10, x9
-; CHECK-NEXT: madd x8, x3, x8, x10
-; CHECK-NEXT: madd x10, x11, x1, x12
-; CHECK-NEXT: mul x11, x11, x0
-; CHECK-NEXT: umulh x12, x1, x2
-; CHECK-NEXT: mul x16, x0, x3
-; CHECK-NEXT: add x10, x10, x11
-; CHECK-NEXT: adds x9, x11, x9
-; CHECK-NEXT: umulh x15, x0, x3
-; CHECK-NEXT: adc x8, x10, x8
-; CHECK-NEXT: adds x10, x14, x13
-; CHECK-NEXT: cinc x12, x12, hs
-; CHECK-NEXT: mul x17, x1, x3
-; CHECK-NEXT: adds x10, x16, x10
-; CHECK-NEXT: umulh x11, x1, x3
-; CHECK-NEXT: cinc x13, x15, hs
-; CHECK-NEXT: adds x12, x12, x13
-; CHECK-NEXT: cset w13, hs
-; CHECK-NEXT: adds x12, x17, x12
-; CHECK-NEXT: adc x11, x11, x13
-; CHECK-NEXT: adds x9, x12, x9
-; CHECK-NEXT: asr x12, x10, #63
+; CHECK-NEXT: asr x9, x1, #63
+; CHECK-NEXT: umulh x10, x0, x2
+; CHECK-NEXT: asr x13, x3, #63
+; CHECK-NEXT: mul x11, x1, x2
+; CHECK-NEXT: umulh x8, x1, x2
+; CHECK-NEXT: mul x9, x9, x2
+; CHECK-NEXT: adds x10, x11, x10
+; CHECK-NEXT: mul x14, x0, x3
+; CHECK-NEXT: umulh x12, x0, x3
+; CHECK-NEXT: adc x8, x8, x9
+; CHECK-NEXT: mul x13, x0, x13
+; CHECK-NEXT: adds x9, x14, x10
+; CHECK-NEXT: mul x11, x1, x3
+; CHECK-NEXT: adc x10, x12, x13
+; CHECK-NEXT: smulh x12, x1, x3
+; CHECK-NEXT: asr x13, x8, #63
+; CHECK-NEXT: asr x14, x10, #63
+; CHECK-NEXT: adds x8, x8, x10
+; CHECK-NEXT: adc x10, x13, x14
+; CHECK-NEXT: adds x8, x11, x8
+; CHECK-NEXT: asr x11, x9, #63
; CHECK-NEXT: mul x13, x0, x2
-; CHECK-NEXT: adc x8, x11, x8
-; CHECK-NEXT: eor x11, x3, x1
-; CHECK-NEXT: eor x8, x8, x12
-; CHECK-NEXT: eor x9, x9, x12
-; CHECK-NEXT: asr x11, x11, #63
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: eor x9, x11, #0x7fffffffffffffff
+; CHECK-NEXT: adc x10, x12, x10
+; CHECK-NEXT: eor x12, x3, x1
+; CHECK-NEXT: eor x8, x8, x11
+; CHECK-NEXT: eor x10, x10, x11
+; CHECK-NEXT: asr x11, x12, #63
+; CHECK-NEXT: orr x8, x8, x10
+; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x1, x9, x10, ne
; CHECK-NEXT: csinv x0, x13, x11, eq
+; CHECK-NEXT: csel x1, x10, x9, ne
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
%2 = extractvalue { i128, i1 } %1, 0
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index 08045e814a35ef..edfd80b4f27061 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -35,41 +35,33 @@ start:
define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
; AARCH-LABEL: __muloti4:
; AARCH: // %bb.0: // %Entry
-; AARCH-NEXT: asr x10, x1, #63
+; AARCH-NEXT: asr x11, x1, #63
; AARCH-NEXT: asr x9, x3, #63
-; AARCH-NEXT: umulh x14, x0, x2
+; AARCH-NEXT: umulh x12, x0, x2
; AARCH-NEXT: mov x8, x1
; AARCH-NEXT: str wzr, [x4]
-; AARCH-NEXT: mul x12, x2, x10
-; AARCH-NEXT: umulh x13, x2, x10
-; AARCH-NEXT: umulh x11, x9, x0
-; AARCH-NEXT: mul x15, x1, x2
-; AARCH-NEXT: add x13, x13, x12
-; AARCH-NEXT: madd x11, x9, x1, x11
-; AARCH-NEXT: mul x9, x9, x0
-; AARCH-NEXT: madd x10, x3, x10, x13
-; AARCH-NEXT: umulh x13, x1, x2
-; AARCH-NEXT: add x11, x11, x9
-; AARCH-NEXT: adds x9, x9, x12
-; AARCH-NEXT: mul x16, x0, x3
-; AARCH-NEXT: adc x10, x11, x10
-; AARCH-NEXT: adds x11, x15, x14
-; AARCH-NEXT: umulh x17, x0, x3
-; AARCH-NEXT: cinc x13, x13, hs
-; AARCH-NEXT: mul x12, x1, x3
-; AARCH-NEXT: adds x1, x16, x11
-; AARCH-NEXT: umulh x11, x8, x3
-; AARCH-NEXT: cinc x14, x17, hs
-; AARCH-NEXT: adds x13, x13, x14
+; AARCH-NEXT: mul x13, x1, x2
+; AARCH-NEXT: umulh x10, x1, x2
+; AARCH-NEXT: mul x11, x11, x2
+; AARCH-NEXT: adds x12, x13, x12
+; AARCH-NEXT: mul x15, x0, x3
+; AARCH-NEXT: umulh x14, x0, x3
+; AARCH-NEXT: adc x10, x10, x11
+; AARCH-NEXT: mul x9, x0, x9
+; AARCH-NEXT: mul x16, x1, x3
+; AARCH-NEXT: adds x1, x15, x12
+; AARCH-NEXT: asr x12, x10, #63
+; AARCH-NEXT: smulh x11, x8, x3
+; AARCH-NEXT: adc x9, x14, x9
+; AARCH-NEXT: asr x13, x9, #63
+; AARCH-NEXT: adds x9, x10, x9
+; AARCH-NEXT: asr x10, x1, #63
; AARCH-NEXT: mul x0, x0, x2
-; AARCH-NEXT: cset w14, hs
-; AARCH-NEXT: adds x12, x12, x13
-; AARCH-NEXT: asr x13, x1, #63
-; AARCH-NEXT: adc x11, x11, x14
-; AARCH-NEXT: adds x9, x12, x9
-; AARCH-NEXT: adc x10, x11, x10
-; AARCH-NEXT: cmp x9, x13
-; AARCH-NEXT: ccmp x10, x13, #0, eq
+; AARCH-NEXT: adc x12, x12, x13
+; AARCH-NEXT: adds x9, x16, x9
+; AARCH-NEXT: adc x11, x11, x12
+; AARCH-NEXT: cmp x9, x10
+; AARCH-NEXT: ccmp x11, x10, #0, eq
; AARCH-NEXT: cset w9, ne
; AARCH-NEXT: tbz x8, #63, .LBB1_2
; AARCH-NEXT: // %bb.1: // %Entry
diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
index 67a10d4bcbaea9..43d56e5d5eb2fb 100644
--- a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
@@ -10,43 +10,33 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; LA32-NEXT: add.w $a5, $a6, $a5
; LA32-NEXT: sltu $a6, $a5, $a6
; LA32-NEXT: mulh.wu $a7, $a1, $a2
+; LA32-NEXT: srai.w $t0, $a1, 31
+; LA32-NEXT: mul.w $t0, $t0, $a2
+; LA32-NEXT: add.w $a7, $a7, $t0
; LA32-NEXT: add.w $a6, $a7, $a6
-; LA32-NEXT: mul.w $a7, $a0, $a3
-; LA32-NEXT: add.w $a5, $a7, $a5
-; LA32-NEXT: sltu $a7, $a5, $a7
-; LA32-NEXT: mulh.wu $t0, $a0, $a3
-; LA32-NEXT: add.w $a7, $t0, $a7
-; LA32-NEXT: add.w $a7, $a6, $a7
-; LA32-NEXT: mul.w $t0, $a1, $a3
-; LA32-NEXT: add.w $t1, $t0, $a7
-; LA32-NEXT: srai.w $t2, $a1, 31
-; LA32-NEXT: mul.w $t3, $a2, $t2
-; LA32-NEXT: srai.w $t4, $a3, 31
-; LA32-NEXT: mul.w $t5, $t4, $a0
-; LA32-NEXT: add.w $t6, $t5, $t3
-; LA32-NEXT: add.w $t7, $t1, $t6
-; LA32-NEXT: sltu $t8, $t7, $t1
-; LA32-NEXT: sltu $t0, $t1, $t0
-; LA32-NEXT: sltu $a6, $a7, $a6
-; LA32-NEXT: mulh.wu $a7, $a1, $a3
+; LA32-NEXT: mulh.wu $a7, $a0, $a3
+; LA32-NEXT: srai.w $t0, $a3, 31
+; LA32-NEXT: mul.w $t0, $a0, $t0
+; LA32-NEXT: add.w $a7, $a7, $t0
+; LA32-NEXT: mul.w $t0, $a0, $a3
+; LA32-NEXT: add.w $a5, $t0, $a5
+; LA32-NEXT: sltu $t0, $a5, $t0
+; LA32-NEXT: add.w $a7, $a7, $t0
+; LA32-NEXT: add.w $t0, $a6, $a7
+; LA32-NEXT: sltu $t1, $t0, $a6
+; LA32-NEXT: srai.w $a6, $a6, 31
+; LA32-NEXT: srai.w $a7, $a7, 31
+; LA32-NEXT: add.w $a6, $a6, $a7
+; LA32-NEXT: add.w $a6, $a6, $t1
+; LA32-NEXT: mulh.w $a7, $a1, $a3
; LA32-NEXT: add.w $a6, $a7, $a6
-; LA32-NEXT: add.w $a6, $a6, $t0
-; LA32-NEXT: mulh.wu $a7, $a2, $t2
-; LA32-NEXT: add.w $a7, $a7, $t3
-; LA32-NEXT: mul.w $a3, $a3, $t2
-; LA32-NEXT: add.w $a3, $a7, $a3
-; LA32-NEXT: mul.w $a1, $t4, $a1
-; LA32-NEXT: mulh.wu $a7, $t4, $a0
-; LA32-NEXT: add.w $a1, $a7, $a1
-; LA32-NEXT: add.w $a1, $a1, $t5
-; LA32-NEXT: add.w $a1, $a1, $a3
-; LA32-NEXT: sltu $a3, $t6, $t5
-; LA32-NEXT: add.w $a1, $a1, $a3
+; LA32-NEXT: mul.w $a1, $a1, $a3
+; LA32-NEXT: add.w $a3, $a1, $t0
+; LA32-NEXT: sltu $a1, $a3, $a1
; LA32-NEXT: add.w $a1, $a6, $a1
-; LA32-NEXT: add.w $a1, $a1, $t8
-; LA32-NEXT: srai.w $a3, $a5, 31
-; LA32-NEXT: xor $a1, $a1, $a3
-; LA32-NEXT: xor $a3, $t7, $a3
+; LA32-NEXT: srai.w $a6, $a5, 31
+; LA32-NEXT: xor $a1, $a1, $a6
+; LA32-NEXT: xor $a3, $a3, $a6
; LA32-NEXT: or $a1, $a3, $a1
; LA32-NEXT: sltu $a1, $zero, $a1
; LA32-NEXT: mul.w $a0, $a0, $a2
@@ -74,19 +64,19 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; LA32-LABEL: smuloi128:
; LA32: # %bb.0:
-; LA32-NEXT: addi.w $sp, $sp, -96
-; LA32-NEXT: .cfi_def_cfa_offset 96
-; LA32-NEXT: st.w $ra, $sp, 92 # 4-byte Folded Spill
-; LA32-NEXT: st.w $fp, $sp, 88 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s0, $sp, 84 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s1, $sp, 80 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s2, $sp, 76 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s3, $sp, 72 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s4, $sp, 68 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s5, $sp, 64 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s6, $sp, 60 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s7, $sp, 56 # 4-byte Folded Spill
-; LA32-NEXT: st.w $s8, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: .cfi_def_cfa_offset 48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s7, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s8, $sp, 4 # 4-byte Folded Spill
; LA32-NEXT: .cfi_offset 1, -4
; LA32-NEXT: .cfi_offset 22, -8
; LA32-NEXT: .cfi_offset 23, -12
@@ -98,295 +88,218 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; LA32-NEXT: .cfi_offset 29, -36
; LA32-NEXT: .cfi_offset 30, -40
; LA32-NEXT: .cfi_offset 31, -44
-; LA32-NEXT: st.w $a2, $sp, 48 # 4-byte Folded Spill
-; LA32-NEXT: ld.w $t0, $a1, 12
-; LA32-NEXT: ld.w $t1, $a1, 8
-; LA32-NEXT: ld.w $a5, $a0, 12
-; LA32-NEXT: ld.w $a7, $a1, 0
-; LA32-NEXT: ld.w $a3, $a0, 0
-; LA32-NEXT: ld.w $a6, $a0, 4
-; LA32-NEXT: ld.w $a4, $a0, 8
-; LA32-NEXT: ld.w $t3, $a1, 4
-; LA32-NEXT: mulh.wu $a0, $a3, $a7
-; LA32-NEXT: mul.w $a1, $a6, $a7
+; LA32-NEXT: ld.w $a5, $a1, 12
+; LA32-NEXT: ld.w $a6, $a1, 8
+; LA32-NEXT: ld.w $t1, $a0, 4
+; LA32-NEXT: ld.w $a3, $a1, 0
+; LA32-NEXT: ld.w $a7, $a0, 8
+; LA32-NEXT: ld.w $t0, $a0, 12
+; LA32-NEXT: ld.w $a4, $a0, 0
+; LA32-NEXT: ld.w $t2, $a1, 4
+; LA32-NEXT: mulh.wu $a0, $a7, $a3
+; LA32-NEXT: mul.w $a1, $t0, $a3
; LA32-NEXT: add.w $a0, $a1, $a0
; LA32-NEXT: sltu $a1, $a0, $a1
-; LA32-NEXT: mulh.wu $t2, $a6, $a7
-; LA32-NEXT: add.w $a1, $t2, $a1
-; LA32-NEXT: mul.w $t2, $a3, $t3
-; LA32-NEXT: add.w $a0, $t2, $a0
-; LA32-NEXT: st.w $a0, $sp, 44 # 4-byte Folded Spill
-; LA32-NEXT: sltu $t2, $a0, $t2
-; LA32-NEXT: mulh.wu $t4, $a3, $t3
-; LA32-NEXT: add.w $t2, $t4, $t2
-; LA32-NEXT: add.w $t2, $a1, $t2
-; LA32-NEXT: mul.w $t4, $a6, $t3
-; LA32-NEXT: add.w $t5, $t4, $t2
-; LA32-NEXT: sltu $t4, $t5, $t4
-; LA32-NEXT: sltu $a1, $t2, $a1
-; LA32-NEXT: mulh.wu $t2, $a6, $t3
-; LA32-NEXT: add.w $a1, $t2, $a1
-; LA32-NEXT: add.w $a1, $a1, $t4
-; LA32-NEXT: mulh.wu $t2, $a4, $a7
-; LA32-NEXT: mul.w $t4, $a5, $a7
+; LA32-NEXT: mulh.wu $t3, $t0, $a3
+; LA32-NEXT: add.w $a1, $t3, $a1
+; LA32-NEXT: mul.w $t3, $a7, $t2
+; LA32-NEXT: add.w $t4, $t3, $a0
+; LA32-NEXT: sltu $a0, $t4, $t3
+; LA32-NEXT: mulh.wu $t3, $a7, $t2
+; LA32-NEXT: add.w $a0, $t3, $a0
+; LA32-NEXT: add.w $t3, $a1, $a0
+; LA32-NEXT: mul.w $t5, $t0, $t2
+; LA32-NEXT: add.w $t6, $t5, $t3
+; LA32-NEXT: srai.w $a0, $t0, 31
+; LA32-NEXT: mul.w $t7, $a3, $a0
+; LA32-NEXT: add.w $t8, $t6, $t7
+; LA32-NEXT: sltu $fp, $t8, $t6
+; LA32-NEXT: sltu $t5, $t6, $t5
+; LA32-NEXT: sltu $a1, $t3, $a1
+; LA32-NEXT: mulh.wu $t3, $t0, $t2
+; LA32-NEXT: add.w $a1, $t3, $a1
+; LA32-NEXT: add.w $a1, $a1, $t5
+; LA32-NEXT: mulh.wu $t3, $a3, $a0
+; LA32-NEXT: add.w $t3, $t3, $t7
+; LA32-NEXT: mul.w $t5, $t2, $a0
+; LA32-NEXT: add.w $t3, $t3, $t5
+; LA32-NEXT: add.w $a1, $a1, $t3
+; LA32-NEXT: add.w $t3, $a1, $fp
+; LA32-NEXT: mulh.wu $a1, $a4, $a3
+; LA32-NEXT: mul.w $t5, $t1, $a3
+; LA32-NEXT: add.w $a1, $t5, $a1
+; LA32-NEXT: sltu $t5, $a1, $t5
+; LA32-NEXT: mulh.wu $t6, $t1, $a3
+; LA32-NEXT: add.w $t5, $t6, $t5
+; LA32-NEXT: mul.w $t6, $a4, $t2
+; LA32-NEXT: add.w $a1, $t6, $a1
+; LA32-NEXT: sltu $t6, $a1, $t6
+; LA32-NEXT: mulh.wu $t7, $a4, $t2
+; LA32-NEXT: add.w $t6, $t7, $t6
+; LA32-NEXT: add.w $t6, $t5, $t6
+; LA32-NEXT: mul.w $t7, $t1, $t2
+; LA32-NEXT: add.w $fp, $t7, $t6
+; LA32-NEXT: sltu $t7, $fp, $t7
+; LA32-NEXT: sltu $t5, $t6, $t5
+; LA32-NEXT: mulh.wu $t2, $t1, $t2
+; LA32-NEXT: add.w $t2, $t2, $t5
+; LA32-NEXT: add.w $t2, $t2, $t7
; LA32-NEXT: add.w $t2, $t4, $t2
-; LA32-NEXT: mul.w $t6, $a4, $t3
-; LA32-NEXT: add.w $t7, $t6, $t2
-; LA32-NEXT: add.w $a1, $t7, $a1
-; LA32-NEXT: mul.w $t8, $a4, $a7
-; LA32-NEXT: add.w $t5, $t8, $t5
-; LA32-NEXT: sltu $t8, $t5, $t8
-; LA32-NEXT: add.w $a1, $a1, $t8
-; LA32-NEXT: sltu $fp, $a1, $t7
-; LA32-NEXT: xor $s0, $a1, $t7
-; LA32-NEXT: sltui $s0, $s0, 1
-; LA32-NEXT: masknez $fp, $fp, $s0
-; LA32-NEXT: maskeqz $t8, $t8, $s0
-; LA32-NEXT: or $t8, $t8, $fp
-; LA32-NEXT: sltu $t2, $t2, $t4
-; LA32-NEXT: mulh.wu $t4, $a5, $a7
-; LA32-NEXT: add.w $t4, $t4, $t2
-; LA32-NEXT: sltu $t2, $t7, $t6
-; LA32-NEXT: mulh.wu $t6, $a4, $t3
-; LA32-NEXT: add.w $t2, $t6, $t2
-; LA32-NEXT: add.w $fp, $t4, $t2
-; LA32-NEXT: mul.w $t6, $a5, $t3
-; LA32-NEXT: add.w $s0, $t6, $fp
+; LA32-NEXT: mul.w $t5, $a7, $a3
+; LA32-NEXT: add.w $t6, $t5, $fp
+; LA32-NEXT: sltu $t5, $t6, $t5
+; LA32-NEXT: add.w $t2, $t2, $t5
+; LA32-NEXT: sltu $t7, $t2, $t4
+; LA32-NEXT: xor $t4, $t2, $t4
+; LA32-NEXT: sltui $t4, $t4, 1
+; LA32-NEXT: masknez $t7, $t7, $t4
+; LA32-NEXT: maskeqz $t4, $t5, $t4
+; LA32-NEXT: or $t4, $t4, $t7
+; LA32-NEXT: add.w $t5, $t8, $t4
+; LA32-NEXT: sltu $t4, $t5, $t8
+; LA32-NEXT: add.w $t4, $t3, $t4
+; LA32-NEXT: mulh.wu $t3, $a4, $a6
+; LA32-NEXT: mul.w $t7, $t1, $a6
+; LA32-NEXT: add.w $t3, $t7, $t3
+; LA32-NEXT: sltu $t7, $t3, $t7
+; LA32-NEXT: mulh.wu $t8, $t1, $a6
+; LA32-NEXT: add.w $t7, $t8, $t7
+; LA32-NEXT: mul.w $t8, $a4, $a5
+; LA32-NEXT: add.w $fp, $t8, $t3
+; LA32-NEXT: sltu $t3, $fp, $t8
+; LA32-NEXT: mulh.wu $t8, $a4, $a5
+; LA32-NEXT: add.w $t3, $t8, $t3
+; LA32-NEXT: add.w $t8, $t7, $t3
+; LA32-NEXT: mul.w $s0, $t1, $a5
; LA32-NEXT: add.w $s1, $s0, $t8
-; LA32-NEXT: mulh.wu $t2, $a3, $t1
-; LA32-NEXT: mul.w $t7, $a6, $t1
-; LA32-NEXT: add.w $t8, $t7, $t2
-; LA32-NEXT: mul.w $s2, $a3, $t0
-; LA32-NEXT: add.w $s3, $s2, $t8
-; LA32-NEXT: add.w $t2, $s3, $a1
-; LA32-NEXT: mul.w $s4, $a3, $t1
-; LA32-NEXT: add.w $a0, $s4, $t5
-; LA32-NEXT: st.w $a0, $sp, 40 # 4-byte Folded Spill
-; LA32-NEXT: sltu $t5, $a0, $s4
-; LA32-NEXT: add.w $a0, $t2, $t5
-; LA32-NEXT: st.w $a0, $sp, 36 # 4-byte Folded Spill
-; LA32-NEXT: sltu $s4, $a0, $s3
-; LA32-NEXT: xor $s5, $a0, $s3
-; LA32-NEXT: sltui $s5, $s5, 1
-; LA32-NEXT: masknez $s4, $s4, $s5
-; LA32-NEXT: maskeqz $t5, $t5, $s5
-; LA32-NEXT: or $t5, $t5, $s4
+; LA32-NEXT: srai.w $t3, $a5, 31
+; LA32-NEXT: mul.w $s2, $t3, $a4
+; LA32-NEXT: add.w $s3, $s1, $s2
+; LA32-NEXT: sltu $s4, $s3, $s1
+; LA32-NEXT: sltu $s0, $s1, $s0
; LA32-NEXT: sltu $t7, $t8, $t7
-; LA32-NEXT: mulh.wu $t8, $a6, $t1
-; LA32-NEXT: add.w $s4, $t8, $t7
-; LA32-NEXT: sltu $t7, $s3, $s2
-; LA32-NEXT: mulh.wu $t8, $a3, $t0
+; LA32-NEXT: mulh.wu $t8, $t1, $a5
; LA32-NEXT: add.w $t7, $t8, $t7
-; LA32-NEXT: add.w $s2, $s4, $t7
-; LA32-NEXT: mul.w $s3, $a6, $t0
-; LA32-NEXT: add.w $s6, $s3, $s2
-; LA32-NEXT: add.w $s7, $s6, $t5
-; LA32-NEXT: add.w $s5, $s1, $s7
-; LA32-NEXT: mul.w $s8, $a4, $t1
-; LA32-NEXT: add.w $ra, $s8, $s5
-; LA32-NEXT: srai.w $t8, $a5, 31
-; LA32-NEXT: mul.w $t7, $a7, $t8
-; LA32-NEXT: st.w $a7, $sp, 28 # 4-byte Folded Spill
-; LA32-NEXT: srai.w $t5, $t0, 31
-; LA32-NEXT: sltu $s5, $s5, $s1
-; LA32-NEXT: sltu $s1, $s1, $s0
-; LA32-NEXT: sltu $s0, $s0, $t6
-; LA32-NEXT: mul.w $t2, $t5, $a3
-; LA32-NEXT: st.w $a3, $sp, 24 # 4-byte Folded Spill
-; LA32-NEXT: sltu $t4, $fp, $t4
-; LA32-NEXT: mulh.wu $fp, $a5, $t3
-; LA32-NEXT: st.w $a5, $sp, 0 # 4-byte Folded Spill
-; LA32-NEXT: add.w $t4, $fp, $t4
-; LA32-NEXT: add.w $fp, $t2, $t7
-; LA32-NEXT: add.w $s0, $t4, $s0
-; LA32-NEXT: add.w $a0, $ra, $fp
-; LA32-NEXT: st.w $a0, $sp, 32 # 4-byte Folded Spill
-; LA32-NEXT: add.w $a2, $s0, $s1
-; LA32-NEXT: sltu $s0, $a0, $ra
-; LA32-NEXT: sltu $s1, $s7, $s6
-; LA32-NEXT: sltu $s3, $s6, $s3
-; LA32-NEXT: sltu $s2, $s2, $s4
-; LA32-NEXT: move $s6, $a6
-; LA32-NEXT: st.w $a6, $sp, 16 # 4-byte Folded Spill
-; LA32-NEXT: mulh.wu $s4, $a6, $t0
-; LA32-NEXT: add.w $s2, $s4, $s2
-; LA32-NEXT: add.w $s2, $s2, $s3
-; LA32-NEXT: add.w $s1, $s2, $s1
-; LA32-NEXT: add.w $s1, $a2, $s1
-; LA32-NEXT: add.w $s7, $s1, $s5
-; LA32-NEXT: move $a0, $a4
-; LA32-NEXT: st.w $a4, $sp, 4 # 4-byte Folded Spill
-; LA32-NEXT: mulh.wu $s1, $a4, $t1
-; LA32-NEXT: mul.w $a5, $a5, $t1
-; LA32-NEXT: add.w $a4, $a5, $s1
-; LA32-NEXT: mul.w $a6, $a0, $t0
-; LA32-NEXT: add.w $a1, $a6, $a4
-; LA32-NEXT: sltu $ra, $ra, $s8
-; LA32-NEXT: add.w $s1, $a1, $s7
-; LA32-NEXT: add.w $s8, $s1, $ra
-; LA32-NEXT: move $a0, $t2
-; LA32-NEXT: st.w $t2, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT: sltu $t6, $fp, $t2
-; LA32-NEXT: mulh.wu $t2, $a7, $t8
-; LA32-NEXT: mul.w $s4, $t3, $t8
-; LA32-NEXT: add.w $a7, $s4, $t2
-; LA32-NEXT: st.w $a7, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: add.w $s3, $t7, $a7
-; LA32-NEXT: mulh.wu $a7, $t5, $a3
-; LA32-NEXT: add.w $t4, $a0, $a7
-; LA32-NEXT: mul.w $s2, $t5, $s6
-; LA32-NEXT: add.w $s1, $s2, $t4
-; LA32-NEXT: add.w $fp, $s1, $s3
-; LA32-NEXT: add.w $a0, $fp, $t6
-; LA32-NEXT: add.w $fp, $s8, $a0
-; LA32-NEXT: add.w $a3, $fp, $s0
-; LA32-NEXT: st.w $a3, $sp, 20 # 4-byte Folded Spill
-; LA32-NEXT: xor $fp, $a3, $s8
+; LA32-NEXT: add.w $t7, $t7, $s0
+; LA32-NEXT: mul.w $t1, $t3, $t1
+; LA32-NEXT: mulh.wu $t8, $t3, $a4
+; LA32-NEXT: add.w $t1, $t8, $t1
+; LA32-NEXT: add.w $t1, $t1, $s2
+; LA32-NEXT: add.w $t1, $t7, $t1
+; LA32-NEXT: add.w $t7, $t1, $s4
+; LA32-NEXT: add.w $t2, $fp, $t2
+; LA32-NEXT: mul.w $t8, $a4, $a6
+; LA32-NEXT: add.w $t1, $t8, $t6
+; LA32-NEXT: sltu $t6, $t1, $t8
+; LA32-NEXT: add.w $t2, $t2, $t6
+; LA32-NEXT: sltu $t8, $t2, $fp
+; LA32-NEXT: xor $fp, $t2, $fp
; LA32-NEXT: sltui $fp, $fp, 1
-; LA32-NEXT: sltu $s6, $a3, $s8
-; LA32-NEXT: masknez $s6, $s6, $fp
-; LA32-NEXT: maskeqz $fp, $s0, $fp
-; LA32-NEXT: or $s6, $fp, $s6
-; LA32-NEXT: sltu $fp, $s7, $a2
-; LA32-NEXT: xor $a2, $s7, $a2
-; LA32-NEXT: sltui $a2, $a2, 1
-; LA32-NEXT: masknez $fp, $fp, $a2
-; LA32-NEXT: maskeqz $a2, $s5, $a2
-; LA32-NEXT: or $s0, $a2, $fp
-; LA32-NEXT: sltu $a2, $a4, $a5
-; LA32-NEXT: ld.w $a5, $sp, 0 # 4-byte Folded Reload
-; LA32-NEXT: mulh.wu $a3, $a5, $t1
-; LA32-NEXT: add.w $a2, $a3, $a2
-; LA32-NEXT: sltu $a3, $a1, $a6
-; LA32-NEXT: ld.w $fp, $sp, 4 # 4-byte Folded Reload
-; LA32-NEXT: mulh.wu $a4, $fp, $t0
-; LA32-NEXT: add.w $a3, $a4, $a3
-; LA32-NEXT: sltu $a4, $s8, $a1
-; LA32-NEXT: xor $a1, $s8, $a1
-; LA32-NEXT: sltui $a1, $a1, 1
-; LA32-NEXT: masknez $a4, $a4, $a1
-; LA32-NEXT: maskeqz $a1, $ra, $a1
-; LA32-NEXT: or $a1, $a1, $a4
-; LA32-NEXT: sltu $a4, $a0, $s1
-; LA32-NEXT: xor $a0, $a0, $s1
-; LA32-NEXT: sltui $a0, $a0, 1
-; LA32-NEXT: masknez $a4, $a4, $a0
-; LA32-NEXT: maskeqz $a0, $t6, $a0
-; LA32-NEXT: or $s5, $a0, $a4
-; LA32-NEXT: sltu $a0, $s3, $t7
-; LA32-NEXT: add.w $a0, $t2, $a0
-; LA32-NEXT: ld.w $t2, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT: sltu $a4, $t4, $t2
-; LA32-NEXT: add.w $s7, $a7, $a4
-; LA32-NEXT: add.w $a3, $a2, $a3
-; LA32-NEXT: sltu $a2, $a3, $a2
-; LA32-NEXT: mulh.wu $a4, $a5, $t0
-; LA32-NEXT: add.w $a2, $a4, $a2
-; LA32-NEXT: mul.w $a4, $a5, $t0
-; LA32-NEXT: move $a6, $a5
-; LA32-NEXT: add.w $a3, $a4, $a3
-; LA32-NEXT: sltu $a4, $a3, $a4
-; LA32-NEXT: add.w $a2, $a2, $a4
-; LA32-NEXT: add.w $a4, $a3, $s0
-; LA32-NEXT: sltu $a3, $a4, $a3
-; LA32-NEXT: add.w $a2, $a2, $a3
-; LA32-NEXT: add.w $s8, $a4, $a1
-; LA32-NEXT: sltu $a1, $s8, $a4
-; LA32-NEXT: add.w $ra, $a2, $a1
-; LA32-NEXT: ld.w $a1, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: sltu $a1, $a1, $s4
-; LA32-NEXT: mulh.wu $a2, $t3, $t8
-; LA32-NEXT: add.w $a1, $a2, $a1
-; LA32-NEXT: add.w $a0, $a1, $a0
-; LA32-NEXT: sltu $a1, $a0, $a1
-; LA32-NEXT: add.w $a1, $a2, $a1
-; LA32-NEXT: add.w $a0, $s4, $a0
-; LA32-NEXT: sltu $a2, $a0, $s4
-; LA32-NEXT: add.w $a1, $a1, $a2
-; LA32-NEXT: mul.w $a2, $t8, $t1
-; LA32-NEXT: mul.w $a3, $t8, $t0
-; LA32-NEXT: mulh.wu $a4, $t8, $t1
-; LA32-NEXT: add.w $a3, $a4, $a3
-; LA32-NEXT: add.w $a3, $a3, $a2
-; LA32-NEXT: add.w $a3, $s3, $a3
-; LA32-NEXT: add.w $a2, $t7, $a2
-; LA32-NEXT: sltu $a4, $a2, $t7
-; LA32-NEXT: add.w $a3, $a3, $a4
-; LA32-NEXT: add.w $a1, $a1, $a3
-; LA32-NEXT: add.w $a2, $a0, $a2
-; LA32-NEXT: sltu $a0, $a2, $a0
-; LA32-NEXT: add.w $a0, $a1, $a0
-; LA32-NEXT: sltu $a1, $s1, $s2
-; LA32-NEXT: ld.w $a3, $sp, 16 # 4-byte Folded Reload
-; LA32-NEXT: mulh.wu $a3, $t5, $a3
-; LA32-NEXT: add.w $a1, $a3, $a1
-; LA32-NEXT: add.w $a1, $s7, $a1
-; LA32-NEXT: sltu $a4, $a1, $s7
-; LA32-NEXT: add.w $a3, $a3, $a4
-; LA32-NEXT: add.w $a1, $s2, $a1
-; LA32-NEXT: sltu $a4, $a1, $s2
-; LA32-NEXT: add.w $a3, $a3, $a4
-; LA32-NEXT: mul.w $a4, $fp, $t5
-; LA32-NEXT: mulh.wu $a5, $fp, $t5
-; LA32-NEXT: mul.w $a6, $a6, $t5
-; LA32-NEXT: add.w $a5, $a5, $a4
-; LA32-NEXT: add.w $a5, $a5, $a6
-; LA32-NEXT: add.w $a5, $a5, $s1
-; LA32-NEXT: add.w $a6, $a4, $t2
-; LA32-NEXT: sltu $a4, $a6, $a4
-; LA32-NEXT: add.w $a4, $a5, $a4
-; LA32-NEXT: add.w $a3, $a3, $a4
-; LA32-NEXT: add.w $a4, $a1, $a6
-; LA32-NEXT: sltu $a1, $a4, $a1
-; LA32-NEXT: add.w $a1, $a3, $a1
-; LA32-NEXT: add.w $a0, $a1, $a0
-; LA32-NEXT: add.w $a1, $a4, $a2
-; LA32-NEXT: sltu $a2, $a1, $a4
-; LA32-NEXT: add.w $a0, $a0, $a2
-; LA32-NEXT: add.w $a2, $a1, $s5
-; LA32-NEXT: sltu $a1, $a2, $a1
-; LA32-NEXT: add.w $a0, $a0, $a1
-; LA32-NEXT: add.w $a0, $ra, $a0
-; LA32-NEXT: add.w $a1, $s8, $a2
-; LA32-NEXT: sltu $a2, $a1, $s8
-; LA32-NEXT: add.w $a0, $a0, $a2
-; LA32-NEXT: add.w $a2, $a1, $s6
-; LA32-NEXT: sltu $a1, $a2, $a1
-; LA32-NEXT: add.w $a0, $a0, $a1
-; LA32-NEXT: ld.w $a4, $sp, 36 # 4-byte Folded Reload
-; LA32-NEXT: srai.w $a1, $a4, 31
-; LA32-NEXT: xor $a0, $a0, $a1
-; LA32-NEXT: ld.w $a3, $sp, 20 # 4-byte Folded Reload
-; LA32-NEXT: xor $a3, $a3, $a1
-; LA32-NEXT: or $a0, $a3, $a0
-; LA32-NEXT: xor $a2, $a2, $a1
-; LA32-NEXT: ld.w $a3, $sp, 32 # 4-byte Folded Reload
-; LA32-NEXT: xor $a1, $a3, $a1
-; LA32-NEXT: or $a1, $a1, $a2
-; LA32-NEXT: or $a0, $a1, $a0
-; LA32-NEXT: ld.w $a1, $sp, 28 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $a2, $sp, 24 # 4-byte Folded Reload
-; LA32-NEXT: mul.w $a1, $a2, $a1
-; LA32-NEXT: ld.w $a2, $sp, 48 # 4-byte Folded Reload
-; LA32-NEXT: st.w $a1, $a2, 0
-; LA32-NEXT: ld.w $a1, $sp, 44 # 4-byte Folded Reload
-; LA32-NEXT: st.w $a1, $a2, 4
-; LA32-NEXT: ld.w $a1, $sp, 40 # 4-byte Folded Reload
-; LA32-NEXT: st.w $a1, $a2, 8
+; LA32-NEXT: masknez $t8, $t8, $fp
+; LA32-NEXT: maskeqz $t6, $t6, $fp
+; LA32-NEXT: or $t6, $t6, $t8
+; LA32-NEXT: add.w $t6, $s3, $t6
+; LA32-NEXT: sltu $t8, $t6, $s3
+; LA32-NEXT: add.w $t7, $t7, $t8
+; LA32-NEXT: add.w $t8, $t4, $t7
+; LA32-NEXT: add.w $t6, $t5, $t6
+; LA32-NEXT: sltu $fp, $t6, $t5
+; LA32-NEXT: add.w $t8, $t8, $fp
+; LA32-NEXT: mulh.wu $t5, $a7, $a6
+; LA32-NEXT: mul.w $s0, $t0, $a6
+; LA32-NEXT: add.w $s1, $s0, $t5
+; LA32-NEXT: mul.w $s2, $a7, $a5
+; LA32-NEXT: add.w $s3, $s2, $s1
+; LA32-NEXT: add.w $s4, $s3, $t8
+; LA32-NEXT: mul.w $s5, $a7, $a6
+; LA32-NEXT: add.w $t5, $s5, $t6
+; LA32-NEXT: sltu $s5, $t5, $s5
+; LA32-NEXT: add.w $t6, $s4, $s5
+; LA32-NEXT: sltu $s4, $t6, $s3
+; LA32-NEXT: xor $s6, $t6, $s3
+; LA32-NEXT: sltui $s6, $s6, 1
+; LA32-NEXT: masknez $s4, $s4, $s6
+; LA32-NEXT: maskeqz $s5, $s5, $s6
+; LA32-NEXT: or $s4, $s5, $s4
+; LA32-NEXT: sltu $s5, $t8, $t4
+; LA32-NEXT: xor $t8, $t8, $t4
+; LA32-NEXT: sltui $t8, $t8, 1
+; LA32-NEXT: masknez $s5, $s5, $t8
+; LA32-NEXT: maskeqz $t8, $fp, $t8
+; LA32-NEXT: or $t8, $t8, $s5
+; LA32-NEXT: srai.w $t4, $t4, 31
+; LA32-NEXT: srai.w $t7, $t7, 31
+; LA32-NEXT: add.w $t7, $t4, $t7
+; LA32-NEXT: add.w $t8, $t7, $t8
+; LA32-NEXT: sltu $fp, $s1, $s0
+; LA32-NEXT: mulh.wu $s0, $t0, $a6
+; LA32-NEXT: add.w $fp, $s0, $fp
+; LA32-NEXT: sltu $s0, $s3, $s2
+; LA32-NEXT: mulh.wu $s1, $a7, $a5
+; LA32-NEXT: add.w $s0, $s1, $s0
+; LA32-NEXT: add.w $s0, $fp, $s0
+; LA32-NEXT: mul.w $s1, $t0, $a5
+; LA32-NEXT: add.w $s2, $s1, $s0
+; LA32-NEXT: mul.w $s3, $a6, $a0
+; LA32-NEXT: mul.w $s5, $t3, $a7
+; LA32-NEXT: add.w $s6, $s5, $s3
+; LA32-NEXT: add.w $s7, $s2, $s6
+; LA32-NEXT: add.w $s8, $s7, $t8
+; LA32-NEXT: add.w $s4, $s8, $s4
+; LA32-NEXT: sltu $ra, $s4, $s8
+; LA32-NEXT: sltu $t4, $t7, $t4
+; LA32-NEXT: add.w $t4, $t7, $t4
+; LA32-NEXT: sltu $t7, $t8, $t7
+; LA32-NEXT: add.w $t4, $t4, $t7
+; LA32-NEXT: sltu $t7, $s7, $s2
+; LA32-NEXT: sltu $t8, $s2, $s1
+; LA32-NEXT: sltu $fp, $s0, $fp
+; LA32-NEXT: mulh.wu $s0, $t0, $a5
+; LA32-NEXT: add.w $fp, $s0, $fp
+; LA32-NEXT: add.w $t8, $fp, $t8
+; LA32-NEXT: mulh.wu $a6, $a6, $a0
+; LA32-NEXT: add.w $a6, $a6, $s3
+; LA32-NEXT: mul.w $a0, $a5, $a0
+; LA32-NEXT: add.w $a0, $a6, $a0
+; LA32-NEXT: mul.w $a5, $t3, $t0
+; LA32-NEXT: mulh.wu $a6, $t3, $a7
+; LA32-NEXT: add.w $a5, $a6, $a5
+; LA32-NEXT: add.w $a5, $a5, $s5
+; LA32-NEXT: add.w $a0, $a5, $a0
+; LA32-NEXT: sltu $a5, $s6, $s5
+; LA32-NEXT: add.w $a0, $a0, $a5
+; LA32-NEXT: add.w $a0, $t8, $a0
+; LA32-NEXT: add.w $a0, $a0, $t7
+; LA32-NEXT: add.w $a0, $a0, $t4
+; LA32-NEXT: sltu $a5, $s8, $s7
+; LA32-NEXT: add.w $a0, $a0, $a5
+; LA32-NEXT: add.w $a0, $a0, $ra
+; LA32-NEXT: srai.w $a5, $t2, 31
+; LA32-NEXT: xor $a0, $a0, $a5
+; LA32-NEXT: xor $a6, $t6, $a5
+; LA32-NEXT: or $a0, $a6, $a0
+; LA32-NEXT: xor $a6, $s4, $a5
+; LA32-NEXT: xor $a5, $t5, $a5
+; LA32-NEXT: or $a5, $a5, $a6
+; LA32-NEXT: or $a0, $a5, $a0
; LA32-NEXT: sltu $a0, $zero, $a0
-; LA32-NEXT: st.w $a4, $a2, 12
-; LA32-NEXT: ld.w $s8, $sp, 52 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s7, $sp, 56 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s6, $sp, 60 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s5, $sp, 64 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s4, $sp, 68 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s3, $sp, 72 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s2, $sp, 76 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s1, $sp, 80 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $s0, $sp, 84 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $fp, $sp, 88 # 4-byte Folded Reload
-; LA32-NEXT: ld.w $ra, $sp, 92 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 96
+; LA32-NEXT: mul.w $a3, $a4, $a3
+; LA32-NEXT: st.w $a3, $a2, 0
+; LA32-NEXT: st.w $a1, $a2, 4
+; LA32-NEXT: st.w $t1, $a2, 8
+; LA32-NEXT: st.w $t2, $a2, 12
+; LA32-NEXT: ld.w $s8, $sp, 4 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
; LA32-NEXT: ret
;
; LA64-LABEL: smuloi128:
@@ -396,43 +309,33 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; LA64-NEXT: add.d $a5, $a6, $a5
; LA64-NEXT: sltu $a6, $a5, $a6
; LA64-NEXT: mulh.du $a7, $a1, $a2
+; LA64-NEXT: srai.d $t0, $a1, 63
+; LA64-NEXT: mul.d $t0, $t0, $a2
+; LA64-NEXT: add.d $a7, $a7, $t0
; LA64-NEXT: add.d $a6, $a7, $a6
-; LA64-NEXT: mul.d $a7, $a0, $a3
-; LA64-NEXT: add.d $a5, $a7, $a5
-; LA64-NEXT: sltu $a7, $a5, $a7
-; LA64-NEXT: mulh.du $t0, $a0, $a3
-; LA64-NEXT: add.d $a7, $t0, $a7
-; LA64-NEXT: add.d $a7, $a6, $a7
-; LA64-NEXT: mul.d $t0, $a1, $a3
-; LA64-NEXT: add.d $t1, $t0, $a7
-; LA64-NEXT: srai.d $t2, $a1, 63
-; LA64-NEXT: mul.d $t3, $a2, $t2
-; LA64-NEXT: srai.d $t4, $a3, 63
-; LA64-NEXT: mul.d $t5, $t4, $a0
-; LA64-NEXT: add.d $t6, $t5, $t3
-; LA64-NEXT: add.d $t7, $t1, $t6
-; LA64-NEXT: sltu $t8, $t7, $t1
-; LA64-NEXT: sltu $t0, $t1, $t0
-; LA64-NEXT: sltu $a6, $a7, $a6
-; LA64-NEXT: mulh.du $a7, $a1, $a3
+; LA64-NEXT: mulh.du $a7, $a0, $a3
+; LA64-NEXT: srai.d $t0, $a3, 63
+; LA64-NEXT: mul.d $t0, $a0, $t0
+; LA64-NEXT: add.d $a7, $a7, $t0
+; LA64-NEXT: mul.d $t0, $a0, $a3
+; LA64-NEXT: add.d $a5, $t0, $a5
+; LA64-NEXT: sltu $t0, $a5, $t0
+; LA64-NEXT: add.d $a7, $a7, $t0
+; LA64-NEXT: add.d $t0, $a6, $a7
+; LA64-NEXT: sltu $t1, $t0, $a6
+; LA64-NEXT: srai.d $a6, $a6, 63
+; LA64-NEXT: srai.d $a7, $a7, 63
+; LA64-NEXT: add.d $a6, $a6, $a7
+; LA64-NEXT: add.d $a6, $a6, $t1
+; LA64-NEXT: mulh.d $a7, $a1, $a3
; LA64-NEXT: add.d $a6, $a7, $a6
-; LA64-NEXT: add.d $a6, $a6, $t0
-; LA64-NEXT: mulh.du $a7, $a2, $t2
-; LA64-NEXT: add.d $a7, $a7, $t3
-; LA64-NEXT: mul.d $a3, $a3, $t2
-; LA64-NEXT: add.d $a3, $a7, $a3
-; LA64-NEXT: mul.d $a1, $t4, $a1
-; LA64-NEXT: mulh.du $a7, $t4, $a0
-; LA64-NEXT: add.d $a1, $a7, $a1
-; LA64-NEXT: add.d $a1, $a1, $t5
-; LA64-NEXT: add.d $a1, $a1, $a3
-; LA64-NEXT: sltu $a3, $t6, $t5
-; LA64-NEXT: add.d $a1, $a1, $a3
+; LA64-NEXT: mul.d $a1, $a1, $a3
+; LA64-NEXT: add.d $a3, $a1, $t0
+; LA64-NEXT: sltu $a1, $a3, $a1
; LA64-NEXT: add.d $a1, $a6, $a1
-; LA64-NEXT: add.d $a1, $a1, $t8
-; LA64-NEXT: srai.d $a3, $a5, 63
-; LA64-NEXT: xor $a1, $a1, $a3
-; LA64-NEXT: xor $a3, $t7, $a3
+; LA64-NEXT: srai.d $a6, $a5, 63
+; LA64-NEXT: xor $a1, $a1, $a6
+; LA64-NEXT: xor $a3, $a3, $a6
; LA64-NEXT: or $a1, $a3, $a1
; LA64-NEXT: sltu $a1, $zero, $a1
; LA64-NEXT: mul.d $a0, $a0, $a2
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index c0cbbb3ff9389e..a30593d7d7afb2 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -1315,65 +1315,37 @@ entry:
define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32-LABEL: smulo.i64:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
-; RV32-NEXT: .cfi_offset s1, -8
-; RV32-NEXT: mulhu a6, a0, a2
-; RV32-NEXT: mul a7, a1, a2
-; RV32-NEXT: mulhu t0, a1, a2
-; RV32-NEXT: mul t1, a0, a3
-; RV32-NEXT: mulhu t2, a0, a3
-; RV32-NEXT: mul a5, a1, a3
-; RV32-NEXT: srai t3, a1, 31
-; RV32-NEXT: srai t4, a3, 31
-; RV32-NEXT: mulhu t5, a1, a3
-; RV32-NEXT: mul t6, a0, a2
-; RV32-NEXT: add a6, a7, a6
-; RV32-NEXT: mul s0, a2, t3
-; RV32-NEXT: mul s1, t4, a0
-; RV32-NEXT: mulhu a2, a2, t3
-; RV32-NEXT: mul a3, a3, t3
-; RV32-NEXT: mul a1, t4, a1
-; RV32-NEXT: mulhu a0, t4, a0
-; RV32-NEXT: sltu a7, a6, a7
-; RV32-NEXT: add a6, t1, a6
-; RV32-NEXT: add t3, s1, s0
-; RV32-NEXT: add a2, a2, s0
+; RV32-NEXT: mulhu a5, a0, a2
+; RV32-NEXT: mul a6, a1, a2
+; RV32-NEXT: mulhsu a7, a1, a2
+; RV32-NEXT: mul t0, a3, a0
+; RV32-NEXT: mulh t1, a1, a3
+; RV32-NEXT: mul a1, a1, a3
+; RV32-NEXT: mulhsu a3, a3, a0
+; RV32-NEXT: mul a2, a0, a2
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: sltu a0, a5, a6
+; RV32-NEXT: add a5, t0, a5
+; RV32-NEXT: add a0, a7, a0
+; RV32-NEXT: sltu a6, a5, t0
+; RV32-NEXT: srai a7, a5, 31
+; RV32-NEXT: add a3, a3, a6
+; RV32-NEXT: srai a6, a0, 31
+; RV32-NEXT: add t0, a0, a3
+; RV32-NEXT: srai a3, a3, 31
+; RV32-NEXT: sltu a0, t0, a0
+; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: add t0, a1, t0
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: sltu a1, t0, a1
+; RV32-NEXT: xor a3, t0, a7
+; RV32-NEXT: add a0, t1, a0
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a7, t0, a7
-; RV32-NEXT: sltu a1, a6, t1
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, s1
-; RV32-NEXT: sltu a3, t3, s1
-; RV32-NEXT: srai t0, a6, 31
-; RV32-NEXT: add a1, t2, a1
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a1, a7, a1
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a2, a5, a1
-; RV32-NEXT: sltu a1, a1, a7
-; RV32-NEXT: add t3, a2, t3
-; RV32-NEXT: sltu a3, a2, a5
-; RV32-NEXT: add a1, t5, a1
-; RV32-NEXT: sltu a2, t3, a2
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: xor a3, t3, t0
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: xor a0, a0, t0
+; RV32-NEXT: xor a0, a0, a7
; RV32-NEXT: or a0, a3, a0
; RV32-NEXT: snez a0, a0
-; RV32-NEXT: sw t6, 0(a4)
-; RV32-NEXT: sw a6, 4(a4)
-; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: .cfi_restore s1
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: sw a2, 0(a4)
+; RV32-NEXT: sw a5, 4(a4)
; RV32-NEXT: ret
;
; RV64-LABEL: smulo.i64:
@@ -1388,65 +1360,37 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV32ZBA-LABEL: smulo.i64:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: addi sp, sp, -16
-; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
-; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT: .cfi_offset s0, -4
-; RV32ZBA-NEXT: .cfi_offset s1, -8
-; RV32ZBA-NEXT: mulhu a6, a0, a2
-; RV32ZBA-NEXT: mul a7, a1, a2
-; RV32ZBA-NEXT: mulhu t0, a1, a2
-; RV32ZBA-NEXT: mul t1, a0, a3
-; RV32ZBA-NEXT: mulhu t2, a0, a3
-; RV32ZBA-NEXT: mul a5, a1, a3
-; RV32ZBA-NEXT: srai t3, a1, 31
-; RV32ZBA-NEXT: srai t4, a3, 31
-; RV32ZBA-NEXT: mulhu t5, a1, a3
-; RV32ZBA-NEXT: mul t6, a0, a2
-; RV32ZBA-NEXT: add a6, a7, a6
-; RV32ZBA-NEXT: mul s0, a2, t3
-; RV32ZBA-NEXT: mul s1, t4, a0
-; RV32ZBA-NEXT: mulhu a2, a2, t3
-; RV32ZBA-NEXT: mul a3, a3, t3
-; RV32ZBA-NEXT: mul a1, t4, a1
-; RV32ZBA-NEXT: mulhu a0, t4, a0
-; RV32ZBA-NEXT: sltu a7, a6, a7
-; RV32ZBA-NEXT: add a6, t1, a6
-; RV32ZBA-NEXT: add t3, s1, s0
-; RV32ZBA-NEXT: add a2, a2, s0
+; RV32ZBA-NEXT: mulhu a5, a0, a2
+; RV32ZBA-NEXT: mul a6, a1, a2
+; RV32ZBA-NEXT: mulhsu a7, a1, a2
+; RV32ZBA-NEXT: mul t0, a3, a0
+; RV32ZBA-NEXT: mulh t1, a1, a3
+; RV32ZBA-NEXT: mul a1, a1, a3
+; RV32ZBA-NEXT: mulhsu a3, a3, a0
+; RV32ZBA-NEXT: mul a2, a0, a2
+; RV32ZBA-NEXT: add a5, a6, a5
+; RV32ZBA-NEXT: sltu a0, a5, a6
+; RV32ZBA-NEXT: add a5, t0, a5
+; RV32ZBA-NEXT: add a0, a7, a0
+; RV32ZBA-NEXT: sltu a6, a5, t0
+; RV32ZBA-NEXT: srai a7, a5, 31
+; RV32ZBA-NEXT: add a3, a3, a6
+; RV32ZBA-NEXT: srai a6, a0, 31
+; RV32ZBA-NEXT: add t0, a0, a3
+; RV32ZBA-NEXT: srai a3, a3, 31
+; RV32ZBA-NEXT: sltu a0, t0, a0
+; RV32ZBA-NEXT: add a3, a6, a3
+; RV32ZBA-NEXT: add t0, a1, t0
+; RV32ZBA-NEXT: add a0, a3, a0
+; RV32ZBA-NEXT: sltu a1, t0, a1
+; RV32ZBA-NEXT: xor a3, t0, a7
+; RV32ZBA-NEXT: add a0, t1, a0
; RV32ZBA-NEXT: add a0, a0, a1
-; RV32ZBA-NEXT: add a7, t0, a7
-; RV32ZBA-NEXT: sltu a1, a6, t1
-; RV32ZBA-NEXT: add a2, a2, a3
-; RV32ZBA-NEXT: add a0, a0, s1
-; RV32ZBA-NEXT: sltu a3, t3, s1
-; RV32ZBA-NEXT: srai t0, a6, 31
-; RV32ZBA-NEXT: add a1, t2, a1
-; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: add a1, a7, a1
-; RV32ZBA-NEXT: add a0, a0, a3
-; RV32ZBA-NEXT: add a2, a5, a1
-; RV32ZBA-NEXT: sltu a1, a1, a7
-; RV32ZBA-NEXT: add t3, a2, t3
-; RV32ZBA-NEXT: sltu a3, a2, a5
-; RV32ZBA-NEXT: add a1, t5, a1
-; RV32ZBA-NEXT: sltu a2, t3, a2
-; RV32ZBA-NEXT: add a1, a1, a3
-; RV32ZBA-NEXT: xor a3, t3, t0
-; RV32ZBA-NEXT: add a0, a1, a0
-; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: xor a0, a0, t0
+; RV32ZBA-NEXT: xor a0, a0, a7
; RV32ZBA-NEXT: or a0, a3, a0
; RV32ZBA-NEXT: snez a0, a0
-; RV32ZBA-NEXT: sw t6, 0(a4)
-; RV32ZBA-NEXT: sw a6, 4(a4)
-; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT: .cfi_restore s0
-; RV32ZBA-NEXT: .cfi_restore s1
-; RV32ZBA-NEXT: addi sp, sp, 16
-; RV32ZBA-NEXT: .cfi_def_cfa_offset 0
+; RV32ZBA-NEXT: sw a2, 0(a4)
+; RV32ZBA-NEXT: sw a5, 4(a4)
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: smulo.i64:
@@ -1461,65 +1405,37 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
;
; RV32ZICOND-LABEL: smulo.i64:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: addi sp, sp, -16
-; RV32ZICOND-NEXT: .cfi_def_cfa_offset 16
-; RV32ZICOND-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZICOND-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
-; RV32ZICOND-NEXT: .cfi_offset s0, -4
-; RV32ZICOND-NEXT: .cfi_offset s1, -8
-; RV32ZICOND-NEXT: mulhu a6, a0, a2
-; RV32ZICOND-NEXT: mul a7, a1, a2
-; RV32ZICOND-NEXT: mulhu t0, a1, a2
-; RV32ZICOND-NEXT: mul t1, a0, a3
-; RV32ZICOND-NEXT: mulhu t2, a0, a3
-; RV32ZICOND-NEXT: mul a5, a1, a3
-; RV32ZICOND-NEXT: srai t3, a1, 31
-; RV32ZICOND-NEXT: srai t4, a3, 31
-; RV32ZICOND-NEXT: mulhu t5, a1, a3
-; RV32ZICOND-NEXT: mul t6, a0, a2
-; RV32ZICOND-NEXT: add a6, a7, a6
-; RV32ZICOND-NEXT: mul s0, a2, t3
-; RV32ZICOND-NEXT: mul s1, t4, a0
-; RV32ZICOND-NEXT: mulhu a2, a2, t3
-; RV32ZICOND-NEXT: mul a3, a3, t3
-; RV32ZICOND-NEXT: mul a1, t4, a1
-; RV32ZICOND-NEXT: mulhu a0, t4, a0
-; RV32ZICOND-NEXT: sltu a7, a6, a7
-; RV32ZICOND-NEXT: add a6, t1, a6
-; RV32ZICOND-NEXT: add t3, s1, s0
-; RV32ZICOND-NEXT: add a2, a2, s0
+; RV32ZICOND-NEXT: mulhu a5, a0, a2
+; RV32ZICOND-NEXT: mul a6, a1, a2
+; RV32ZICOND-NEXT: mulhsu a7, a1, a2
+; RV32ZICOND-NEXT: mul t0, a3, a0
+; RV32ZICOND-NEXT: mulh t1, a1, a3
+; RV32ZICOND-NEXT: mul a1, a1, a3
+; RV32ZICOND-NEXT: mulhsu a3, a3, a0
+; RV32ZICOND-NEXT: mul a2, a0, a2
+; RV32ZICOND-NEXT: add a5, a6, a5
+; RV32ZICOND-NEXT: sltu a0, a5, a6
+; RV32ZICOND-NEXT: add a5, t0, a5
+; RV32ZICOND-NEXT: add a0, a7, a0
+; RV32ZICOND-NEXT: sltu a6, a5, t0
+; RV32ZICOND-NEXT: srai a7, a5, 31
+; RV32ZICOND-NEXT: add a3, a3, a6
+; RV32ZICOND-NEXT: srai a6, a0, 31
+; RV32ZICOND-NEXT: add t0, a0, a3
+; RV32ZICOND-NEXT: srai a3, a3, 31
+; RV32ZICOND-NEXT: sltu a0, t0, a0
+; RV32ZICOND-NEXT: add a3, a6, a3
+; RV32ZICOND-NEXT: add t0, a1, t0
+; RV32ZICOND-NEXT: add a0, a3, a0
+; RV32ZICOND-NEXT: sltu a1, t0, a1
+; RV32ZICOND-NEXT: xor a3, t0, a7
+; RV32ZICOND-NEXT: add a0, t1, a0
; RV32ZICOND-NEXT: add a0, a0, a1
-; RV32ZICOND-NEXT: add a7, t0, a7
-; RV32ZICOND-NEXT: sltu a1, a6, t1
-; RV32ZICOND-NEXT: add a2, a2, a3
-; RV32ZICOND-NEXT: add a0, a0, s1
-; RV32ZICOND-NEXT: sltu a3, t3, s1
-; RV32ZICOND-NEXT: srai t0, a6, 31
-; RV32ZICOND-NEXT: add a1, t2, a1
-; RV32ZICOND-NEXT: add a0, a0, a2
-; RV32ZICOND-NEXT: add a1, a7, a1
-; RV32ZICOND-NEXT: add a0, a0, a3
-; RV32ZICOND-NEXT: add a2, a5, a1
-; RV32ZICOND-NEXT: sltu a1, a1, a7
-; RV32ZICOND-NEXT: add t3, a2, t3
-; RV32ZICOND-NEXT: sltu a3, a2, a5
-; RV32ZICOND-NEXT: add a1, t5, a1
-; RV32ZICOND-NEXT: sltu a2, t3, a2
-; RV32ZICOND-NEXT: add a1, a1, a3
-; RV32ZICOND-NEXT: xor a3, t3, t0
-; RV32ZICOND-NEXT: add a0, a1, a0
-; RV32ZICOND-NEXT: add a0, a0, a2
-; RV32ZICOND-NEXT: xor a0, a0, t0
+; RV32ZICOND-NEXT: xor a0, a0, a7
; RV32ZICOND-NEXT: or a0, a3, a0
; RV32ZICOND-NEXT: snez a0, a0
-; RV32ZICOND-NEXT: sw t6, 0(a4)
-; RV32ZICOND-NEXT: sw a6, 4(a4)
-; RV32ZICOND-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZICOND-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
-; RV32ZICOND-NEXT: .cfi_restore s0
-; RV32ZICOND-NEXT: .cfi_restore s1
-; RV32ZICOND-NEXT: addi sp, sp, 16
-; RV32ZICOND-NEXT: .cfi_def_cfa_offset 0
+; RV32ZICOND-NEXT: sw a2, 0(a4)
+; RV32ZICOND-NEXT: sw a5, 4(a4)
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: smulo.i64:
@@ -1543,26 +1459,21 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
; RV32-LABEL: smulo2.i64:
; RV32: # %bb.0: # %entry
; RV32-NEXT: li a3, 13
-; RV32-NEXT: srai a4, a1, 31
-; RV32-NEXT: mulhu a5, a0, a3
-; RV32-NEXT: mul a6, a1, a3
-; RV32-NEXT: mulhu a1, a1, a3
-; RV32-NEXT: mul a7, a4, a3
-; RV32-NEXT: mulh a4, a4, a3
+; RV32-NEXT: mulhu a4, a0, a3
+; RV32-NEXT: mul a5, a1, a3
+; RV32-NEXT: mulh a1, a1, a3
; RV32-NEXT: mul a3, a0, a3
-; RV32-NEXT: add a5, a6, a5
-; RV32-NEXT: sltu a0, a5, a6
-; RV32-NEXT: srai a6, a5, 31
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: sltu a0, a4, a5
+; RV32-NEXT: srai a5, a4, 31
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add a7, a0, a7
-; RV32-NEXT: xor a1, a7, a6
-; RV32-NEXT: sltu a0, a7, a0
-; RV32-NEXT: add a0, a4, a0
-; RV32-NEXT: xor a0, a0, a6
+; RV32-NEXT: xor a1, a0, a5
+; RV32-NEXT: srai a0, a0, 31
+; RV32-NEXT: xor a0, a0, a5
; RV32-NEXT: or a0, a1, a0
; RV32-NEXT: snez a0, a0
; RV32-NEXT: sw a3, 0(a2)
-; RV32-NEXT: sw a5, 4(a2)
+; RV32-NEXT: sw a4, 4(a2)
; RV32-NEXT: ret
;
; RV64-LABEL: smulo2.i64:
@@ -1580,28 +1491,22 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: li a3, 13
; RV32ZBA-NEXT: sh1add a4, a1, a1
-; RV32ZBA-NEXT: srai a5, a1, 31
-; RV32ZBA-NEXT: sh1add a6, a0, a0
-; RV32ZBA-NEXT: mulhu a7, a0, a3
+; RV32ZBA-NEXT: sh1add a5, a0, a0
; RV32ZBA-NEXT: sh2add a4, a4, a1
-; RV32ZBA-NEXT: mulhu a1, a1, a3
-; RV32ZBA-NEXT: sh1add t0, a5, a5
-; RV32ZBA-NEXT: mulh a3, a5, a3
-; RV32ZBA-NEXT: sh2add a6, a6, a0
-; RV32ZBA-NEXT: add a7, a4, a7
-; RV32ZBA-NEXT: sh2add a0, t0, a5
-; RV32ZBA-NEXT: sltu a4, a7, a4
-; RV32ZBA-NEXT: srai a5, a7, 31
-; RV32ZBA-NEXT: add a1, a1, a4
+; RV32ZBA-NEXT: mulh a1, a1, a3
+; RV32ZBA-NEXT: mulhu a3, a0, a3
+; RV32ZBA-NEXT: sh2add a5, a5, a0
+; RV32ZBA-NEXT: add a3, a4, a3
+; RV32ZBA-NEXT: sltu a0, a3, a4
+; RV32ZBA-NEXT: srai a4, a3, 31
; RV32ZBA-NEXT: add a0, a1, a0
-; RV32ZBA-NEXT: xor a4, a0, a5
-; RV32ZBA-NEXT: sltu a0, a0, a1
-; RV32ZBA-NEXT: add a0, a3, a0
-; RV32ZBA-NEXT: xor a0, a0, a5
-; RV32ZBA-NEXT: or a0, a4, a0
+; RV32ZBA-NEXT: xor a1, a0, a4
+; RV32ZBA-NEXT: srai a0, a0, 31
+; RV32ZBA-NEXT: xor a0, a0, a4
+; RV32ZBA-NEXT: or a0, a1, a0
; RV32ZBA-NEXT: snez a0, a0
-; RV32ZBA-NEXT: sw a6, 0(a2)
-; RV32ZBA-NEXT: sw a7, 4(a2)
+; RV32ZBA-NEXT: sw a5, 0(a2)
+; RV32ZBA-NEXT: sw a3, 4(a2)
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: smulo2.i64:
@@ -1619,26 +1524,21 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
; RV32ZICOND-LABEL: smulo2.i64:
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: li a3, 13
-; RV32ZICOND-NEXT: srai a4, a1, 31
-; RV32ZICOND-NEXT: mulhu a5, a0, a3
-; RV32ZICOND-NEXT: mul a6, a1, a3
-; RV32ZICOND-NEXT: mulhu a1, a1, a3
-; RV32ZICOND-NEXT: mul a7, a4, a3
-; RV32ZICOND-NEXT: mulh a4, a4, a3
+; RV32ZICOND-NEXT: mulhu a4, a0, a3
+; RV32ZICOND-NEXT: mul a5, a1, a3
+; RV32ZICOND-NEXT: mulh a1, a1, a3
; RV32ZICOND-NEXT: mul a3, a0, a3
-; RV32ZICOND-NEXT: add a5, a6, a5
-; RV32ZICOND-NEXT: sltu a0, a5, a6
-; RV32ZICOND-NEXT: srai a6, a5, 31
+; RV32ZICOND-NEXT: add a4, a5, a4
+; RV32ZICOND-NEXT: sltu a0, a4, a5
+; RV32ZICOND-NEXT: srai a5, a4, 31
; RV32ZICOND-NEXT: add a0, a1, a0
-; RV32ZICOND-NEXT: add a7, a0, a7
-; RV32ZICOND-NEXT: xor a1, a7, a6
-; RV32ZICOND-NEXT: sltu a0, a7, a0
-; RV32ZICOND-NEXT: add a0, a4, a0
-; RV32ZICOND-NEXT: xor a0, a0, a6
+; RV32ZICOND-NEXT: xor a1, a0, a5
+; RV32ZICOND-NEXT: srai a0, a0, 31
+; RV32ZICOND-NEXT: xor a0, a0, a5
; RV32ZICOND-NEXT: or a0, a1, a0
; RV32ZICOND-NEXT: snez a0, a0
; RV32ZICOND-NEXT: sw a3, 0(a2)
-; RV32ZICOND-NEXT: sw a5, 4(a2)
+; RV32ZICOND-NEXT: sw a4, 4(a2)
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: smulo2.i64:
@@ -3319,62 +3219,38 @@ entry:
define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32-LABEL: smulo.select.i64:
; RV32: # %bb.0: # %entry
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset s0, -4
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
-; RV32-NEXT: mulhu a6, a1, a2
-; RV32-NEXT: mul a7, a0, a3
-; RV32-NEXT: mulhu t0, a0, a3
-; RV32-NEXT: mul t1, a1, a3
-; RV32-NEXT: srai t2, a1, 31
-; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: mulhu t4, a1, a3
+; RV32-NEXT: mulhsu a6, a1, a2
+; RV32-NEXT: mul a7, a3, a0
+; RV32-NEXT: mulhsu t0, a3, a0
; RV32-NEXT: add a4, a5, a4
-; RV32-NEXT: mul t5, a2, t2
-; RV32-NEXT: mul t6, t3, a0
-; RV32-NEXT: mul s0, t3, a1
-; RV32-NEXT: mulhu t3, t3, a0
-; RV32-NEXT: add t3, t3, s0
-; RV32-NEXT: mulhu s0, a2, t2
-; RV32-NEXT: mul t2, a3, t2
; RV32-NEXT: sltu a5, a4, a5
; RV32-NEXT: add a4, a7, a4
-; RV32-NEXT: add s0, s0, t5
-; RV32-NEXT: add t5, t6, t5
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a6, a4, a7
-; RV32-NEXT: add t2, s0, t2
-; RV32-NEXT: add t3, t3, t6
-; RV32-NEXT: sltu a7, t5, t6
-; RV32-NEXT: srai a4, a4, 31
; RV32-NEXT: add a6, t0, a6
-; RV32-NEXT: add t2, t3, t2
+; RV32-NEXT: srai a7, a5, 31
+; RV32-NEXT: srai t0, a6, 31
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: mulh t0, a1, a3
; RV32-NEXT: add a6, a5, a6
-; RV32-NEXT: add a7, t2, a7
-; RV32-NEXT: add t0, t1, a6
; RV32-NEXT: sltu a5, a6, a5
-; RV32-NEXT: add t5, t0, t5
-; RV32-NEXT: sltu a6, t0, t1
-; RV32-NEXT: add a5, t4, a5
-; RV32-NEXT: sltu t0, t5, t0
-; RV32-NEXT: add a5, a5, a6
+; RV32-NEXT: add a5, a7, a5
+; RV32-NEXT: mul a7, a1, a3
+; RV32-NEXT: srai a4, a4, 31
+; RV32-NEXT: add a6, a7, a6
+; RV32-NEXT: sltu a7, a6, a7
+; RV32-NEXT: add a5, t0, a5
; RV32-NEXT: add a5, a5, a7
-; RV32-NEXT: add a5, a5, t0
; RV32-NEXT: xor a5, a5, a4
-; RV32-NEXT: xor a4, t5, a4
+; RV32-NEXT: xor a4, a6, a4
; RV32-NEXT: or a4, a4, a5
; RV32-NEXT: bnez a4, .LBB46_2
; RV32-NEXT: # %bb.1: # %entry
; RV32-NEXT: mv a0, a2
; RV32-NEXT: mv a1, a3
; RV32-NEXT: .LBB46_2: # %entry
-; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: smulo.select.i64:
@@ -3390,62 +3266,38 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
;
; RV32ZBA-LABEL: smulo.select.i64:
; RV32ZBA: # %bb.0: # %entry
-; RV32ZBA-NEXT: addi sp, sp, -16
-; RV32ZBA-NEXT: .cfi_def_cfa_offset 16
-; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT: .cfi_offset s0, -4
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
-; RV32ZBA-NEXT: mulhu a6, a1, a2
-; RV32ZBA-NEXT: mul a7, a0, a3
-; RV32ZBA-NEXT: mulhu t0, a0, a3
-; RV32ZBA-NEXT: mul t1, a1, a3
-; RV32ZBA-NEXT: srai t2, a1, 31
-; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: mulhu t4, a1, a3
+; RV32ZBA-NEXT: mulhsu a6, a1, a2
+; RV32ZBA-NEXT: mul a7, a3, a0
+; RV32ZBA-NEXT: mulhsu t0, a3, a0
; RV32ZBA-NEXT: add a4, a5, a4
-; RV32ZBA-NEXT: mul t5, a2, t2
-; RV32ZBA-NEXT: mul t6, t3, a0
-; RV32ZBA-NEXT: mul s0, t3, a1
-; RV32ZBA-NEXT: mulhu t3, t3, a0
-; RV32ZBA-NEXT: add t3, t3, s0
-; RV32ZBA-NEXT: mulhu s0, a2, t2
-; RV32ZBA-NEXT: mul t2, a3, t2
; RV32ZBA-NEXT: sltu a5, a4, a5
; RV32ZBA-NEXT: add a4, a7, a4
-; RV32ZBA-NEXT: add s0, s0, t5
-; RV32ZBA-NEXT: add t5, t6, t5
; RV32ZBA-NEXT: add a5, a6, a5
; RV32ZBA-NEXT: sltu a6, a4, a7
-; RV32ZBA-NEXT: add t2, s0, t2
-; RV32ZBA-NEXT: add t3, t3, t6
-; RV32ZBA-NEXT: sltu a7, t5, t6
-; RV32ZBA-NEXT: srai a4, a4, 31
; RV32ZBA-NEXT: add a6, t0, a6
-; RV32ZBA-NEXT: add t2, t3, t2
+; RV32ZBA-NEXT: srai a7, a5, 31
+; RV32ZBA-NEXT: srai t0, a6, 31
+; RV32ZBA-NEXT: add a7, a7, t0
+; RV32ZBA-NEXT: mulh t0, a1, a3
; RV32ZBA-NEXT: add a6, a5, a6
-; RV32ZBA-NEXT: add a7, t2, a7
-; RV32ZBA-NEXT: add t0, t1, a6
; RV32ZBA-NEXT: sltu a5, a6, a5
-; RV32ZBA-NEXT: add t5, t0, t5
-; RV32ZBA-NEXT: sltu a6, t0, t1
-; RV32ZBA-NEXT: add a5, t4, a5
-; RV32ZBA-NEXT: sltu t0, t5, t0
-; RV32ZBA-NEXT: add a5, a5, a6
+; RV32ZBA-NEXT: add a5, a7, a5
+; RV32ZBA-NEXT: mul a7, a1, a3
+; RV32ZBA-NEXT: srai a4, a4, 31
+; RV32ZBA-NEXT: add a6, a7, a6
+; RV32ZBA-NEXT: sltu a7, a6, a7
+; RV32ZBA-NEXT: add a5, t0, a5
; RV32ZBA-NEXT: add a5, a5, a7
-; RV32ZBA-NEXT: add a5, a5, t0
; RV32ZBA-NEXT: xor a5, a5, a4
-; RV32ZBA-NEXT: xor a4, t5, a4
+; RV32ZBA-NEXT: xor a4, a6, a4
; RV32ZBA-NEXT: or a4, a4, a5
; RV32ZBA-NEXT: bnez a4, .LBB46_2
; RV32ZBA-NEXT: # %bb.1: # %entry
; RV32ZBA-NEXT: mv a0, a2
; RV32ZBA-NEXT: mv a1, a3
; RV32ZBA-NEXT: .LBB46_2: # %entry
-; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT: .cfi_restore s0
-; RV32ZBA-NEXT: addi sp, sp, 16
-; RV32ZBA-NEXT: .cfi_def_cfa_offset 0
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: smulo.select.i64:
@@ -3461,51 +3313,31 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
;
; RV32ZICOND-LABEL: smulo.select.i64:
; RV32ZICOND: # %bb.0: # %entry
-; RV32ZICOND-NEXT: addi sp, sp, -16
-; RV32ZICOND-NEXT: .cfi_def_cfa_offset 16
-; RV32ZICOND-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZICOND-NEXT: .cfi_offset s0, -4
; RV32ZICOND-NEXT: mulhu a4, a0, a2
; RV32ZICOND-NEXT: mul a5, a1, a2
-; RV32ZICOND-NEXT: mulhu a6, a1, a2
-; RV32ZICOND-NEXT: mul a7, a0, a3
-; RV32ZICOND-NEXT: mulhu t0, a0, a3
-; RV32ZICOND-NEXT: mul t1, a1, a3
-; RV32ZICOND-NEXT: srai t2, a1, 31
-; RV32ZICOND-NEXT: srai t3, a3, 31
-; RV32ZICOND-NEXT: mulhu t4, a1, a3
+; RV32ZICOND-NEXT: mulhsu a6, a1, a2
+; RV32ZICOND-NEXT: mul a7, a3, a0
+; RV32ZICOND-NEXT: mulhsu t0, a3, a0
; RV32ZICOND-NEXT: add a4, a5, a4
-; RV32ZICOND-NEXT: mul t5, a2, t2
-; RV32ZICOND-NEXT: mul t6, t3, a0
-; RV32ZICOND-NEXT: mul s0, t3, a1
-; RV32ZICOND-NEXT: mulhu t3, t3, a0
-; RV32ZICOND-NEXT: add t3, t3, s0
-; RV32ZICOND-NEXT: mulhu s0, a2, t2
-; RV32ZICOND-NEXT: mul t2, a3, t2
; RV32ZICOND-NEXT: sltu a5, a4, a5
; RV32ZICOND-NEXT: add a4, a7, a4
-; RV32ZICOND-NEXT: add s0, s0, t5
-; RV32ZICOND-NEXT: add t5, t6, t5
; RV32ZICOND-NEXT: add a5, a6, a5
; RV32ZICOND-NEXT: sltu a6, a4, a7
-; RV32ZICOND-NEXT: add t2, s0, t2
-; RV32ZICOND-NEXT: add t3, t3, t6
-; RV32ZICOND-NEXT: sltu a7, t5, t6
-; RV32ZICOND-NEXT: srai a4, a4, 31
; RV32ZICOND-NEXT: add a6, t0, a6
-; RV32ZICOND-NEXT: add t2, t3, t2
+; RV32ZICOND-NEXT: srai a7, a5, 31
+; RV32ZICOND-NEXT: srai t0, a6, 31
+; RV32ZICOND-NEXT: add a7, a7, t0
+; RV32ZICOND-NEXT: mulh t0, a1, a3
; RV32ZICOND-NEXT: add a6, a5, a6
-; RV32ZICOND-NEXT: add a7, t2, a7
-; RV32ZICOND-NEXT: add t0, t1, a6
; RV32ZICOND-NEXT: sltu a5, a6, a5
-; RV32ZICOND-NEXT: add t5, t0, t5
-; RV32ZICOND-NEXT: sltu a6, t0, t1
-; RV32ZICOND-NEXT: add a5, t4, a5
-; RV32ZICOND-NEXT: sltu t0, t5, t0
-; RV32ZICOND-NEXT: add a5, a5, a6
-; RV32ZICOND-NEXT: xor a6, t5, a4
+; RV32ZICOND-NEXT: add a5, a7, a5
+; RV32ZICOND-NEXT: mul a7, a1, a3
+; RV32ZICOND-NEXT: srai a4, a4, 31
+; RV32ZICOND-NEXT: add a6, a7, a6
+; RV32ZICOND-NEXT: sltu a7, a6, a7
+; RV32ZICOND-NEXT: xor a6, a6, a4
+; RV32ZICOND-NEXT: add a5, t0, a5
; RV32ZICOND-NEXT: add a5, a5, a7
-; RV32ZICOND-NEXT: add a5, a5, t0
; RV32ZICOND-NEXT: xor a4, a5, a4
; RV32ZICOND-NEXT: or a4, a6, a4
; RV32ZICOND-NEXT: czero.nez a2, a2, a4
@@ -3514,10 +3346,6 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
; RV32ZICOND-NEXT: czero.eqz a1, a1, a4
; RV32ZICOND-NEXT: or a0, a0, a2
; RV32ZICOND-NEXT: or a1, a1, a3
-; RV32ZICOND-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZICOND-NEXT: .cfi_restore s0
-; RV32ZICOND-NEXT: addi sp, sp, 16
-; RV32ZICOND-NEXT: .cfi_def_cfa_offset 0
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: smulo.select.i64:
@@ -3542,47 +3370,31 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32: # %bb.0: # %entry
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
-; RV32-NEXT: mulhu a6, a1, a2
-; RV32-NEXT: mul a7, a0, a3
-; RV32-NEXT: mulhu t0, a0, a3
-; RV32-NEXT: mul t1, a1, a3
-; RV32-NEXT: srai t2, a1, 31
-; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: mulhu t4, a1, a3
+; RV32-NEXT: mulhsu a2, a1, a2
+; RV32-NEXT: mul a6, a3, a0
+; RV32-NEXT: mulhsu a0, a3, a0
+; RV32-NEXT: mulh a7, a1, a3
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a4, a5, a4
-; RV32-NEXT: mul t5, a2, t2
-; RV32-NEXT: mul t6, t3, a0
-; RV32-NEXT: mulhu a2, a2, t2
-; RV32-NEXT: mul a3, a3, t2
-; RV32-NEXT: mul a1, t3, a1
-; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: sltu a5, a4, a5
-; RV32-NEXT: add a4, a7, a4
-; RV32-NEXT: add t2, t6, t5
-; RV32-NEXT: add a2, a2, t5
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a5, a6, a5
-; RV32-NEXT: sltu a1, a4, a7
+; RV32-NEXT: sltu a3, a4, a5
+; RV32-NEXT: add a4, a6, a4
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, t6
-; RV32-NEXT: sltu a3, t2, t6
+; RV32-NEXT: sltu a3, a4, a6
; RV32-NEXT: srai a4, a4, 31
-; RV32-NEXT: add a1, t0, a1
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a1, a5, a1
; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a2, t1, a1
-; RV32-NEXT: sltu a1, a1, a5
-; RV32-NEXT: add t2, a2, t2
-; RV32-NEXT: sltu a3, a2, t1
-; RV32-NEXT: add a1, t4, a1
-; RV32-NEXT: sltu a2, t2, a2
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: srai a3, a2, 31
+; RV32-NEXT: add a5, a2, a0
+; RV32-NEXT: srai a0, a0, 31
+; RV32-NEXT: sltu a2, a5, a2
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: add a5, a1, a5
; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: sltu a1, a5, a1
+; RV32-NEXT: add a0, a7, a0
+; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: xor a0, a0, a4
-; RV32-NEXT: xor a1, t2, a4
-; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: xor a4, a5, a4
+; RV32-NEXT: or a0, a4, a0
; RV32-NEXT: seqz a0, a0
; RV32-NEXT: ret
;
@@ -3599,47 +3411,31 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
-; RV32ZBA-NEXT: mulhu a6, a1, a2
-; RV32ZBA-NEXT: mul a7, a0, a3
-; RV32ZBA-NEXT: mulhu t0, a0, a3
-; RV32ZBA-NEXT: mul t1, a1, a3
-; RV32ZBA-NEXT: srai t2, a1, 31
-; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: mulhu t4, a1, a3
+; RV32ZBA-NEXT: mulhsu a2, a1, a2
+; RV32ZBA-NEXT: mul a6, a3, a0
+; RV32ZBA-NEXT: mulhsu a0, a3, a0
+; RV32ZBA-NEXT: mulh a7, a1, a3
+; RV32ZBA-NEXT: mul a1, a1, a3
; RV32ZBA-NEXT: add a4, a5, a4
-; RV32ZBA-NEXT: mul t5, a2, t2
-; RV32ZBA-NEXT: mul t6, t3, a0
-; RV32ZBA-NEXT: mulhu a2, a2, t2
-; RV32ZBA-NEXT: mul a3, a3, t2
-; RV32ZBA-NEXT: mul a1, t3, a1
-; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: sltu a5, a4, a5
-; RV32ZBA-NEXT: add a4, a7, a4
-; RV32ZBA-NEXT: add t2, t6, t5
-; RV32ZBA-NEXT: add a2, a2, t5
-; RV32ZBA-NEXT: add a0, a0, a1
-; RV32ZBA-NEXT: add a5, a6, a5
-; RV32ZBA-NEXT: sltu a1, a4, a7
+; RV32ZBA-NEXT: sltu a3, a4, a5
+; RV32ZBA-NEXT: add a4, a6, a4
; RV32ZBA-NEXT: add a2, a2, a3
-; RV32ZBA-NEXT: add a0, a0, t6
-; RV32ZBA-NEXT: sltu a3, t2, t6
+; RV32ZBA-NEXT: sltu a3, a4, a6
; RV32ZBA-NEXT: srai a4, a4, 31
-; RV32ZBA-NEXT: add a1, t0, a1
-; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: add a1, a5, a1
; RV32ZBA-NEXT: add a0, a0, a3
-; RV32ZBA-NEXT: add a2, t1, a1
-; RV32ZBA-NEXT: sltu a1, a1, a5
-; RV32ZBA-NEXT: add t2, a2, t2
-; RV32ZBA-NEXT: sltu a3, a2, t1
-; RV32ZBA-NEXT: add a1, t4, a1
-; RV32ZBA-NEXT: sltu a2, t2, a2
-; RV32ZBA-NEXT: add a1, a1, a3
-; RV32ZBA-NEXT: add a0, a1, a0
+; RV32ZBA-NEXT: srai a3, a2, 31
+; RV32ZBA-NEXT: add a5, a2, a0
+; RV32ZBA-NEXT: srai a0, a0, 31
+; RV32ZBA-NEXT: sltu a2, a5, a2
+; RV32ZBA-NEXT: add a0, a3, a0
+; RV32ZBA-NEXT: add a5, a1, a5
; RV32ZBA-NEXT: add a0, a0, a2
+; RV32ZBA-NEXT: sltu a1, a5, a1
+; RV32ZBA-NEXT: add a0, a7, a0
+; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: xor a0, a0, a4
-; RV32ZBA-NEXT: xor a1, t2, a4
-; RV32ZBA-NEXT: or a0, a1, a0
+; RV32ZBA-NEXT: xor a4, a5, a4
+; RV32ZBA-NEXT: or a0, a4, a0
; RV32ZBA-NEXT: seqz a0, a0
; RV32ZBA-NEXT: ret
;
@@ -3656,47 +3452,31 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: mulhu a4, a0, a2
; RV32ZICOND-NEXT: mul a5, a1, a2
-; RV32ZICOND-NEXT: mulhu a6, a1, a2
-; RV32ZICOND-NEXT: mul a7, a0, a3
-; RV32ZICOND-NEXT: mulhu t0, a0, a3
-; RV32ZICOND-NEXT: mul t1, a1, a3
-; RV32ZICOND-NEXT: srai t2, a1, 31
-; RV32ZICOND-NEXT: srai t3, a3, 31
-; RV32ZICOND-NEXT: mulhu t4, a1, a3
+; RV32ZICOND-NEXT: mulhsu a2, a1, a2
+; RV32ZICOND-NEXT: mul a6, a3, a0
+; RV32ZICOND-NEXT: mulhsu a0, a3, a0
+; RV32ZICOND-NEXT: mulh a7, a1, a3
+; RV32ZICOND-NEXT: mul a1, a1, a3
; RV32ZICOND-NEXT: add a4, a5, a4
-; RV32ZICOND-NEXT: mul t5, a2, t2
-; RV32ZICOND-NEXT: mul t6, t3, a0
-; RV32ZICOND-NEXT: mulhu a2, a2, t2
-; RV32ZICOND-NEXT: mul a3, a3, t2
-; RV32ZICOND-NEXT: mul a1, t3, a1
-; RV32ZICOND-NEXT: mulhu a0, t3, a0
-; RV32ZICOND-NEXT: sltu a5, a4, a5
-; RV32ZICOND-NEXT: add a4, a7, a4
-; RV32ZICOND-NEXT: add t2, t6, t5
-; RV32ZICOND-NEXT: add a2, a2, t5
-; RV32ZICOND-NEXT: add a0, a0, a1
-; RV32ZICOND-NEXT: add a5, a6, a5
-; RV32ZICOND-NEXT: sltu a1, a4, a7
+; RV32ZICOND-NEXT: sltu a3, a4, a5
+; RV32ZICOND-NEXT: add a4, a6, a4
; RV32ZICOND-NEXT: add a2, a2, a3
-; RV32ZICOND-NEXT: add a0, a0, t6
-; RV32ZICOND-NEXT: sltu a3, t2, t6
+; RV32ZICOND-NEXT: sltu a3, a4, a6
; RV32ZICOND-NEXT: srai a4, a4, 31
-; RV32ZICOND-NEXT: add a1, t0, a1
-; RV32ZICOND-NEXT: add a0, a0, a2
-; RV32ZICOND-NEXT: add a1, a5, a1
; RV32ZICOND-NEXT: add a0, a0, a3
-; RV32ZICOND-NEXT: add a2, t1, a1
-; RV32ZICOND-NEXT: sltu a1, a1, a5
-; RV32ZICOND-NEXT: add t2, a2, t2
-; RV32ZICOND-NEXT: sltu a3, a2, t1
-; RV32ZICOND-NEXT: add a1, t4, a1
-; RV32ZICOND-NEXT: sltu a2, t2, a2
-; RV32ZICOND-NEXT: add a1, a1, a3
-; RV32ZICOND-NEXT: add a0, a1, a0
+; RV32ZICOND-NEXT: srai a3, a2, 31
+; RV32ZICOND-NEXT: add a5, a2, a0
+; RV32ZICOND-NEXT: srai a0, a0, 31
+; RV32ZICOND-NEXT: sltu a2, a5, a2
+; RV32ZICOND-NEXT: add a0, a3, a0
+; RV32ZICOND-NEXT: add a5, a1, a5
; RV32ZICOND-NEXT: add a0, a0, a2
+; RV32ZICOND-NEXT: sltu a1, a5, a1
+; RV32ZICOND-NEXT: add a0, a7, a0
+; RV32ZICOND-NEXT: add a0, a0, a1
; RV32ZICOND-NEXT: xor a0, a0, a4
-; RV32ZICOND-NEXT: xor a1, t2, a4
-; RV32ZICOND-NEXT: or a0, a1, a0
+; RV32ZICOND-NEXT: xor a4, a5, a4
+; RV32ZICOND-NEXT: or a0, a4, a0
; RV32ZICOND-NEXT: seqz a0, a0
; RV32ZICOND-NEXT: ret
;
@@ -4879,47 +4659,31 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32: # %bb.0: # %entry
; RV32-NEXT: mulhu a4, a0, a2
; RV32-NEXT: mul a5, a1, a2
-; RV32-NEXT: mulhu a6, a1, a2
-; RV32-NEXT: mul a7, a0, a3
-; RV32-NEXT: mulhu t0, a0, a3
-; RV32-NEXT: mul t1, a1, a3
-; RV32-NEXT: srai t2, a1, 31
-; RV32-NEXT: srai t3, a3, 31
-; RV32-NEXT: mulhu t4, a1, a3
+; RV32-NEXT: mulhsu a2, a1, a2
+; RV32-NEXT: mul a6, a3, a0
+; RV32-NEXT: mulhsu a0, a3, a0
+; RV32-NEXT: mulh a7, a1, a3
+; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a4, a5, a4
-; RV32-NEXT: mul t5, a2, t2
-; RV32-NEXT: mul t6, t3, a0
-; RV32-NEXT: mulhu a2, a2, t2
-; RV32-NEXT: mul a3, a3, t2
-; RV32-NEXT: mul a1, t3, a1
-; RV32-NEXT: mulhu a0, t3, a0
-; RV32-NEXT: sltu a5, a4, a5
-; RV32-NEXT: add a4, a7, a4
-; RV32-NEXT: add t2, t6, t5
-; RV32-NEXT: add a2, a2, t5
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: add a5, a6, a5
-; RV32-NEXT: sltu a1, a4, a7
+; RV32-NEXT: sltu a3, a4, a5
+; RV32-NEXT: add a4, a6, a4
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: add a0, a0, t6
-; RV32-NEXT: sltu a3, t2, t6
+; RV32-NEXT: sltu a3, a4, a6
; RV32-NEXT: srai a4, a4, 31
-; RV32-NEXT: add a1, t0, a1
-; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: add a1, a5, a1
; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: add a2, t1, a1
-; RV32-NEXT: sltu a1, a1, a5
-; RV32-NEXT: add t2, a2, t2
-; RV32-NEXT: sltu a3, a2, t1
-; RV32-NEXT: add a1, t4, a1
-; RV32-NEXT: sltu a2, t2, a2
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: srai a3, a2, 31
+; RV32-NEXT: add a5, a2, a0
+; RV32-NEXT: srai a0, a0, 31
+; RV32-NEXT: sltu a2, a5, a2
+; RV32-NEXT: add a0, a3, a0
+; RV32-NEXT: add a5, a1, a5
; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: sltu a1, a5, a1
+; RV32-NEXT: add a0, a7, a0
+; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: xor a0, a0, a4
-; RV32-NEXT: xor a1, t2, a4
-; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: xor a4, a5, a4
+; RV32-NEXT: or a0, a4, a0
; RV32-NEXT: beqz a0, .LBB61_2
; RV32-NEXT: # %bb.1: # %overflow
; RV32-NEXT: li a0, 0
@@ -4945,47 +4709,31 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: mulhu a4, a0, a2
; RV32ZBA-NEXT: mul a5, a1, a2
-; RV32ZBA-NEXT: mulhu a6, a1, a2
-; RV32ZBA-NEXT: mul a7, a0, a3
-; RV32ZBA-NEXT: mulhu t0, a0, a3
-; RV32ZBA-NEXT: mul t1, a1, a3
-; RV32ZBA-NEXT: srai t2, a1, 31
-; RV32ZBA-NEXT: srai t3, a3, 31
-; RV32ZBA-NEXT: mulhu t4, a1, a3
+; RV32ZBA-NEXT: mulhsu a2, a1, a2
+; RV32ZBA-NEXT: mul a6, a3, a0
+; RV32ZBA-NEXT: mulhsu a0, a3, a0
+; RV32ZBA-NEXT: mulh a7, a1, a3
+; RV32ZBA-NEXT: mul a1, a1, a3
; RV32ZBA-NEXT: add a4, a5, a4
-; RV32ZBA-NEXT: mul t5, a2, t2
-; RV32ZBA-NEXT: mul t6, t3, a0
-; RV32ZBA-NEXT: mulhu a2, a2, t2
-; RV32ZBA-NEXT: mul a3, a3, t2
-; RV32ZBA-NEXT: mul a1, t3, a1
-; RV32ZBA-NEXT: mulhu a0, t3, a0
-; RV32ZBA-NEXT: sltu a5, a4, a5
-; RV32ZBA-NEXT: add a4, a7, a4
-; RV32ZBA-NEXT: add t2, t6, t5
-; RV32ZBA-NEXT: add a2, a2, t5
-; RV32ZBA-NEXT: add a0, a0, a1
-; RV32ZBA-NEXT: add a5, a6, a5
-; RV32ZBA-NEXT: sltu a1, a4, a7
+; RV32ZBA-NEXT: sltu a3, a4, a5
+; RV32ZBA-NEXT: add a4, a6, a4
; RV32ZBA-NEXT: add a2, a2, a3
-; RV32ZBA-NEXT: add a0, a0, t6
-; RV32ZBA-NEXT: sltu a3, t2, t6
+; RV32ZBA-NEXT: sltu a3, a4, a6
; RV32ZBA-NEXT: srai a4, a4, 31
-; RV32ZBA-NEXT: add a1, t0, a1
-; RV32ZBA-NEXT: add a0, a0, a2
-; RV32ZBA-NEXT: add a1, a5, a1
; RV32ZBA-NEXT: add a0, a0, a3
-; RV32ZBA-NEXT: add a2, t1, a1
-; RV32ZBA-NEXT: sltu a1, a1, a5
-; RV32ZBA-NEXT: add t2, a2, t2
-; RV32ZBA-NEXT: sltu a3, a2, t1
-; RV32ZBA-NEXT: add a1, t4, a1
-; RV32ZBA-NEXT: sltu a2, t2, a2
-; RV32ZBA-NEXT: add a1, a1, a3
-; RV32ZBA-NEXT: add a0, a1, a0
+; RV32ZBA-NEXT: srai a3, a2, 31
+; RV32ZBA-NEXT: add a5, a2, a0
+; RV32ZBA-NEXT: srai a0, a0, 31
+; RV32ZBA-NEXT: sltu a2, a5, a2
+; RV32ZBA-NEXT: add a0, a3, a0
+; RV32ZBA-NEXT: add a5, a1, a5
; RV32ZBA-NEXT: add a0, a0, a2
+; RV32ZBA-NEXT: sltu a1, a5, a1
+; RV32ZBA-NEXT: add a0, a7, a0
+; RV32ZBA-NEXT: add a0, a0, a1
; RV32ZBA-NEXT: xor a0, a0, a4
-; RV32ZBA-NEXT: xor a1, t2, a4
-; RV32ZBA-NEXT: or a0, a1, a0
+; RV32ZBA-NEXT: xor a4, a5, a4
+; RV32ZBA-NEXT: or a0, a4, a0
; RV32ZBA-NEXT: beqz a0, .LBB61_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
; RV32ZBA-NEXT: li a0, 0
@@ -5011,47 +4759,31 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: mulhu a4, a0, a2
; RV32ZICOND-NEXT: mul a5, a1, a2
-; RV32ZICOND-NEXT: mulhu a6, a1, a2
-; RV32ZICOND-NEXT: mul a7, a0, a3
-; RV32ZICOND-NEXT: mulhu t0, a0, a3
-; RV32ZICOND-NEXT: mul t1, a1, a3
-; RV32ZICOND-NEXT: srai t2, a1, 31
-; RV32ZICOND-NEXT: srai t3, a3, 31
-; RV32ZICOND-NEXT: mulhu t4, a1, a3
+; RV32ZICOND-NEXT: mulhsu a2, a1, a2
+; RV32ZICOND-NEXT: mul a6, a3, a0
+; RV32ZICOND-NEXT: mulhsu a0, a3, a0
+; RV32ZICOND-NEXT: mulh a7, a1, a3
+; RV32ZICOND-NEXT: mul a1, a1, a3
; RV32ZICOND-NEXT: add a4, a5, a4
-; RV32ZICOND-NEXT: mul t5, a2, t2
-; RV32ZICOND-NEXT: mul t6, t3, a0
-; RV32ZICOND-NEXT: mulhu a2, a2, t2
-; RV32ZICOND-NEXT: mul a3, a3, t2
-; RV32ZICOND-NEXT: mul a1, t3, a1
-; RV32ZICOND-NEXT: mulhu a0, t3, a0
-; RV32ZICOND-NEXT: sltu a5, a4, a5
-; RV32ZICOND-NEXT: add a4, a7, a4
-; RV32ZICOND-NEXT: add t2, t6, t5
-; RV32ZICOND-NEXT: add a2, a2, t5
-; RV32ZICOND-NEXT: add a0, a0, a1
-; RV32ZICOND-NEXT: add a5, a6, a5
-; RV32ZICOND-NEXT: sltu a1, a4, a7
+; RV32ZICOND-NEXT: sltu a3, a4, a5
+; RV32ZICOND-NEXT: add a4, a6, a4
; RV32ZICOND-NEXT: add a2, a2, a3
-; RV32ZICOND-NEXT: add a0, a0, t6
-; RV32ZICOND-NEXT: sltu a3, t2, t6
+; RV32ZICOND-NEXT: sltu a3, a4, a6
; RV32ZICOND-NEXT: srai a4, a4, 31
-; RV32ZICOND-NEXT: add a1, t0, a1
-; RV32ZICOND-NEXT: add a0, a0, a2
-; RV32ZICOND-NEXT: add a1, a5, a1
; RV32ZICOND-NEXT: add a0, a0, a3
-; RV32ZICOND-NEXT: add a2, t1, a1
-; RV32ZICOND-NEXT: sltu a1, a1, a5
-; RV32ZICOND-NEXT: add t2, a2, t2
-; RV32ZICOND-NEXT: sltu a3, a2, t1
-; RV32ZICOND-NEXT: add a1, t4, a1
-; RV32ZICOND-NEXT: sltu a2, t2, a2
-; RV32ZICOND-NEXT: add a1, a1, a3
-; RV32ZICOND-NEXT: add a0, a1, a0
+; RV32ZICOND-NEXT: srai a3, a2, 31
+; RV32ZICOND-NEXT: add a5, a2, a0
+; RV32ZICOND-NEXT: srai a0, a0, 31
+; RV32ZICOND-NEXT: sltu a2, a5, a2
+; RV32ZICOND-NEXT: add a0, a3, a0
+; RV32ZICOND-NEXT: add a5, a1, a5
; RV32ZICOND-NEXT: add a0, a0, a2
+; RV32ZICOND-NEXT: sltu a1, a5, a1
+; RV32ZICOND-NEXT: add a0, a7, a0
+; RV32ZICOND-NEXT: add a0, a0, a1
; RV32ZICOND-NEXT: xor a0, a0, a4
-; RV32ZICOND-NEXT: xor a1, t2, a4
-; RV32ZICOND-NEXT: or a0, a1, a0
+; RV32ZICOND-NEXT: xor a4, a5, a4
+; RV32ZICOND-NEXT: or a0, a4, a0
; RV32ZICOND-NEXT: beqz a0, .LBB61_2
; RV32ZICOND-NEXT: # %bb.1: # %overflow
; RV32ZICOND-NEXT: li a0, 0
@@ -5091,40 +4823,31 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
; RV32-NEXT: li a2, -13
; RV32-NEXT: neg a3, a0
; RV32-NEXT: li a4, -1
-; RV32-NEXT: srai a5, a1, 31
-; RV32-NEXT: neg a6, a1
-; RV32-NEXT: add a7, a0, a1
-; RV32-NEXT: mulhu t0, a0, a2
-; RV32-NEXT: mul t1, a1, a2
-; RV32-NEXT: mulhu t2, a1, a2
-; RV32-NEXT: mulhu t3, a0, a4
-; RV32-NEXT: mul t4, a5, a2
-; RV32-NEXT: mulhu a4, a1, a4
-; RV32-NEXT: mulh a2, a5, a2
-; RV32-NEXT: add t0, t1, t0
-; RV32-NEXT: sub a5, t4, a0
-; RV32-NEXT: sub a7, t3, a7
-; RV32-NEXT: sltu t1, t0, t1
-; RV32-NEXT: sub a0, t0, a0
-; RV32-NEXT: sltu t0, a5, a3
-; RV32-NEXT: add a2, a7, a2
-; RV32-NEXT: add t1, t2, t1
-; RV32-NEXT: sltu a3, a0, a3
-; RV32-NEXT: add a2, a2, t0
-; RV32-NEXT: srai a0, a0, 31
-; RV32-NEXT: add a3, t3, a3
-; RV32-NEXT: add a3, t1, a3
-; RV32-NEXT: sub a1, a3, a1
-; RV32-NEXT: sltu a3, a3, t1
-; RV32-NEXT: add a5, a1, a5
-; RV32-NEXT: sltu a6, a1, a6
-; RV32-NEXT: add a3, a4, a3
-; RV32-NEXT: sltu a1, a5, a1
+; RV32-NEXT: mulhu a5, a0, a2
+; RV32-NEXT: mul a6, a1, a2
+; RV32-NEXT: mulhsu a2, a1, a2
+; RV32-NEXT: add a5, a6, a5
+; RV32-NEXT: sltu a6, a5, a6
+; RV32-NEXT: sub a5, a5, a0
+; RV32-NEXT: mulhsu a0, a4, a0
+; RV32-NEXT: add a2, a2, a6
+; RV32-NEXT: sltu a3, a5, a3
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: srai a3, a2, 31
+; RV32-NEXT: srai a6, a0, 31
; RV32-NEXT: add a3, a3, a6
+; RV32-NEXT: neg a6, a1
+; RV32-NEXT: mulh a4, a1, a4
+; RV32-NEXT: srai a5, a5, 31
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: sltu a2, a0, a2
+; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: add a2, a3, a2
+; RV32-NEXT: sltu a1, a0, a6
+; RV32-NEXT: add a2, a4, a2
; RV32-NEXT: add a1, a2, a1
-; RV32-NEXT: xor a1, a1, a0
-; RV32-NEXT: xor a0, a5, a0
+; RV32-NEXT: xor a1, a1, a5
+; RV32-NEXT: xor a0, a0, a5
; RV32-NEXT: or a0, a0, a1
; RV32-NEXT: beqz a0, .LBB62_2
; RV32-NEXT: # %bb.1: # %overflow
@@ -5153,40 +4876,31 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
; RV32ZBA-NEXT: li a2, -13
; RV32ZBA-NEXT: neg a3, a0
; RV32ZBA-NEXT: li a4, -1
-; RV32ZBA-NEXT: srai a5, a1, 31
-; RV32ZBA-NEXT: neg a6, a1
-; RV32ZBA-NEXT: add a7, a0, a1
-; RV32ZBA-NEXT: mulhu t0, a0, a2
-; RV32ZBA-NEXT: mul t1, a1, a2
-; RV32ZBA-NEXT: mulhu t2, a1, a2
-; RV32ZBA-NEXT: mulhu t3, a0, a4
-; RV32ZBA-NEXT: mul t4, a5, a2
-; RV32ZBA-NEXT: mulhu a4, a1, a4
-; RV32ZBA-NEXT: mulh a2, a5, a2
-; RV32ZBA-NEXT: add t0, t1, t0
-; RV32ZBA-NEXT: sub a5, t4, a0
-; RV32ZBA-NEXT: sub a7, t3, a7
-; RV32ZBA-NEXT: sltu t1, t0, t1
-; RV32ZBA-NEXT: sub a0, t0, a0
-; RV32ZBA-NEXT: sltu t0, a5, a3
-; RV32ZBA-NEXT: add a2, a7, a2
-; RV32ZBA-NEXT: add t1, t2, t1
-; RV32ZBA-NEXT: sltu a3, a0, a3
-; RV32ZBA-NEXT: add a2, a2, t0
-; RV32ZBA-NEXT: srai a0, a0, 31
-; RV32ZBA-NEXT: add a3, t3, a3
-; RV32ZBA-NEXT: add a3, t1, a3
-; RV32ZBA-NEXT: sub a1, a3, a1
-; RV32ZBA-NEXT: sltu a3, a3, t1
-; RV32ZBA-NEXT: add a5, a1, a5
-; RV32ZBA-NEXT: sltu a6, a1, a6
-; RV32ZBA-NEXT: add a3, a4, a3
-; RV32ZBA-NEXT: sltu a1, a5, a1
+; RV32ZBA-NEXT: mulhu a5, a0, a2
+; RV32ZBA-NEXT: mul a6, a1, a2
+; RV32ZBA-NEXT: mulhsu a2, a1, a2
+; RV32ZBA-NEXT: add a5, a6, a5
+; RV32ZBA-NEXT: sltu a6, a5, a6
+; RV32ZBA-NEXT: sub a5, a5, a0
+; RV32ZBA-NEXT: mulhsu a0, a4, a0
+; RV32ZBA-NEXT: add a2, a2, a6
+; RV32ZBA-NEXT: sltu a3, a5, a3
+; RV32ZBA-NEXT: add a0, a0, a3
+; RV32ZBA-NEXT: srai a3, a2, 31
+; RV32ZBA-NEXT: srai a6, a0, 31
; RV32ZBA-NEXT: add a3, a3, a6
+; RV32ZBA-NEXT: neg a6, a1
+; RV32ZBA-NEXT: mulh a4, a1, a4
+; RV32ZBA-NEXT: srai a5, a5, 31
+; RV32ZBA-NEXT: add a0, a2, a0
+; RV32ZBA-NEXT: sltu a2, a0, a2
+; RV32ZBA-NEXT: sub a0, a0, a1
; RV32ZBA-NEXT: add a2, a3, a2
+; RV32ZBA-NEXT: sltu a1, a0, a6
+; RV32ZBA-NEXT: add a2, a4, a2
; RV32ZBA-NEXT: add a1, a2, a1
-; RV32ZBA-NEXT: xor a1, a1, a0
-; RV32ZBA-NEXT: xor a0, a5, a0
+; RV32ZBA-NEXT: xor a1, a1, a5
+; RV32ZBA-NEXT: xor a0, a0, a5
; RV32ZBA-NEXT: or a0, a0, a1
; RV32ZBA-NEXT: beqz a0, .LBB62_2
; RV32ZBA-NEXT: # %bb.1: # %overflow
@@ -5215,40 +4929,31 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
; RV32ZICOND-NEXT: li a2, -13
; RV32ZICOND-NEXT: neg a3, a0
; RV32ZICOND-NEXT: li a4, -1
-; RV32ZICOND-NEXT: srai a5, a1, 31
-; RV32ZICOND-NEXT: neg a6, a1
-; RV32ZICOND-NEXT: add a7, a0, a1
-; RV32ZICOND-NEXT: mulhu t0, a0, a2
-; RV32ZICOND-NEXT: mul t1, a1, a2
-; RV32ZICOND-NEXT: mulhu t2, a1, a2
-; RV32ZICOND-NEXT: mulhu t3, a0, a4
-; RV32ZICOND-NEXT: mul t4, a5, a2
-; RV32ZICOND-NEXT: mulhu a4, a1, a4
-; RV32ZICOND-NEXT: mulh a2, a5, a2
-; RV32ZICOND-NEXT: add t0, t1, t0
-; RV32ZICOND-NEXT: sub a5, t4, a0
-; RV32ZICOND-NEXT: sub a7, t3, a7
-; RV32ZICOND-NEXT: sltu t1, t0, t1
-; RV32ZICOND-NEXT: sub a0, t0, a0
-; RV32ZICOND-NEXT: sltu t0, a5, a3
-; RV32ZICOND-NEXT: add a2, a7, a2
-; RV32ZICOND-NEXT: add t1, t2, t1
-; RV32ZICOND-NEXT: sltu a3, a0, a3
-; RV32ZICOND-NEXT: add a2, a2, t0
-; RV32ZICOND-NEXT: srai a0, a0, 31
-; RV32ZICOND-NEXT: add a3, t3, a3
-; RV32ZICOND-NEXT: add a3, t1, a3
-; RV32ZICOND-NEXT: sub a1, a3, a1
-; RV32ZICOND-NEXT: sltu a3, a3, t1
-; RV32ZICOND-NEXT: add a5, a1, a5
-; RV32ZICOND-NEXT: sltu a6, a1, a6
-; RV32ZICOND-NEXT: add a3, a4, a3
-; RV32ZICOND-NEXT: sltu a1, a5, a1
+; RV32ZICOND-NEXT: mulhu a5, a0, a2
+; RV32ZICOND-NEXT: mul a6, a1, a2
+; RV32ZICOND-NEXT: mulhsu a2, a1, a2
+; RV32ZICOND-NEXT: add a5, a6, a5
+; RV32ZICOND-NEXT: sltu a6, a5, a6
+; RV32ZICOND-NEXT: sub a5, a5, a0
+; RV32ZICOND-NEXT: mulhsu a0, a4, a0
+; RV32ZICOND-NEXT: add a2, a2, a6
+; RV32ZICOND-NEXT: sltu a3, a5, a3
+; RV32ZICOND-NEXT: add a0, a0, a3
+; RV32ZICOND-NEXT: srai a3, a2, 31
+; RV32ZICOND-NEXT: srai a6, a0, 31
; RV32ZICOND-NEXT: add a3, a3, a6
+; RV32ZICOND-NEXT: neg a6, a1
+; RV32ZICOND-NEXT: mulh a4, a1, a4
+; RV32ZICOND-NEXT: srai a5, a5, 31
+; RV32ZICOND-NEXT: add a0, a2, a0
+; RV32ZICOND-NEXT: sltu a2, a0, a2
+; RV32ZICOND-NEXT: sub a0, a0, a1
; RV32ZICOND-NEXT: add a2, a3, a2
+; RV32ZICOND-NEXT: sltu a1, a0, a6
+; RV32ZICOND-NEXT: add a2, a4, a2
; RV32ZICOND-NEXT: add a1, a2, a1
-; RV32ZICOND-NEXT: xor a1, a1, a0
-; RV32ZICOND-NEXT: xor a0, a5, a0
+; RV32ZICOND-NEXT: xor a1, a1, a5
+; RV32ZICOND-NEXT: xor a0, a0, a5
; RV32ZICOND-NEXT: or a0, a0, a1
; RV32ZICOND-NEXT: beqz a0, .LBB62_2
; RV32ZICOND-NEXT: # %bb.1: # %overflow
diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
index 4d6f99abc02dc4..07e4c408a3ff04 100644
--- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
@@ -6,154 +6,130 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC-LABEL: muloti_test:
; SPARC: ! %bb.0: ! %start
; SPARC-NEXT: save %sp, -96, %sp
-; SPARC-NEXT: ld [%fp+96], %l1
-; SPARC-NEXT: mov %i3, %g4
-; SPARC-NEXT: mov %i2, %g2
-; SPARC-NEXT: umul %i3, %l1, %i3
+; SPARC-NEXT: ld [%fp+96], %l2
+; SPARC-NEXT: mov %i3, %g2
+; SPARC-NEXT: mov %i2, %g3
+; SPARC-NEXT: umul %i1, %l2, %l0
; SPARC-NEXT: rd %y, %i2
-; SPARC-NEXT: ld [%fp+92], %l2
-; SPARC-NEXT: umul %g2, %l1, %g3
-; SPARC-NEXT: rd %y, %l0
-; SPARC-NEXT: addcc %g3, %i2, %i2
-; SPARC-NEXT: addxcc %l0, 0, %g3
-; SPARC-NEXT: umul %g4, %l2, %l0
+; SPARC-NEXT: ld [%fp+92], %l1
+; SPARC-NEXT: umul %i0, %l2, %i3
+; SPARC-NEXT: rd %y, %g4
+; SPARC-NEXT: addcc %i3, %i2, %i2
+; SPARC-NEXT: addxcc %g4, 0, %i3
+; SPARC-NEXT: umul %i1, %l1, %g4
; SPARC-NEXT: rd %y, %l3
-; SPARC-NEXT: addcc %l0, %i2, %i2
-; SPARC-NEXT: addxcc %l3, 0, %l0
-; SPARC-NEXT: addcc %g3, %l0, %g3
-; SPARC-NEXT: addxcc %g0, 0, %l0
-; SPARC-NEXT: umul %g2, %l2, %l3
-; SPARC-NEXT: rd %y, %l4
-; SPARC-NEXT: addcc %l3, %g3, %g3
-; SPARC-NEXT: umul %i1, %l1, %l3
+; SPARC-NEXT: addcc %g4, %i2, %l4
+; SPARC-NEXT: addxcc %l3, 0, %i2
+; SPARC-NEXT: addcc %i3, %i2, %i2
+; SPARC-NEXT: addxcc %g0, 0, %i3
+; SPARC-NEXT: umul %i0, %l1, %g4
+; SPARC-NEXT: rd %y, %l3
+; SPARC-NEXT: addcc %g4, %i2, %i2
+; SPARC-NEXT: sra %i0, 31, %g4
+; SPARC-NEXT: smul %l1, %g4, %l5
+; SPARC-NEXT: umul %l2, %g4, %l6
+; SPARC-NEXT: rd %y, %l7
+; SPARC-NEXT: addxcc %l3, %i3, %l3
+; SPARC-NEXT: add %l7, %l6, %i3
+; SPARC-NEXT: add %i3, %l5, %l5
+; SPARC-NEXT: addcc %i2, %l6, %l6
+; SPARC-NEXT: umul %g2, %l2, %i3
+; SPARC-NEXT: rd %y, %i2
+; SPARC-NEXT: addxcc %l3, %l5, %l3
+; SPARC-NEXT: umul %g3, %l2, %l2
; SPARC-NEXT: rd %y, %l5
-; SPARC-NEXT: addxcc %l4, %l0, %l0
-; SPARC-NEXT: umul %i0, %l1, %l4
-; SPARC-NEXT: rd %y, %l6
-; SPARC-NEXT: addcc %l4, %l5, %l4
-; SPARC-NEXT: addxcc %l6, 0, %l5
-; SPARC-NEXT: umul %i1, %l2, %l6
+; SPARC-NEXT: addcc %l2, %i2, %i2
+; SPARC-NEXT: addxcc %l5, 0, %l2
+; SPARC-NEXT: umul %g2, %l1, %l5
; SPARC-NEXT: rd %y, %l7
-; SPARC-NEXT: addcc %l6, %l4, %l4
-; SPARC-NEXT: addxcc %l7, 0, %l6
-; SPARC-NEXT: addcc %l5, %l6, %l5
-; SPARC-NEXT: addxcc %g0, 0, %l6
-; SPARC-NEXT: umul %i0, %l2, %l7
-; SPARC-NEXT: rd %y, %o0
-; SPARC-NEXT: addcc %l7, %l5, %l5
-; SPARC-NEXT: addxcc %o0, %l6, %l6
-; SPARC-NEXT: addcc %l3, %g3, %g3
-; SPARC-NEXT: addxcc %l4, %l0, %l0
-; SPARC-NEXT: addxcc %l5, 0, %l3
-; SPARC-NEXT: umul %g4, %i5, %l4
+; SPARC-NEXT: addcc %l5, %i2, %i2
+; SPARC-NEXT: addxcc %l7, 0, %l5
+; SPARC-NEXT: addcc %l2, %l5, %l2
+; SPARC-NEXT: addxcc %g0, 0, %l5
+; SPARC-NEXT: umul %g3, %l1, %l1
+; SPARC-NEXT: rd %y, %l7
+; SPARC-NEXT: addcc %l1, %l2, %l1
+; SPARC-NEXT: addxcc %l7, %l5, %l2
+; SPARC-NEXT: addcc %l0, %l1, %l0
+; SPARC-NEXT: addxcc %l4, %l2, %l1
+; SPARC-NEXT: addxcc %l6, 0, %l2
+; SPARC-NEXT: addxcc %l3, 0, %l3
+; SPARC-NEXT: umul %g2, %i5, %l4
; SPARC-NEXT: rd %y, %l5
-; SPARC-NEXT: addxcc %l6, 0, %l6
-; SPARC-NEXT: umul %g2, %i5, %l7
+; SPARC-NEXT: sra %l3, 31, %l6
+; SPARC-NEXT: umul %g3, %i5, %l7
; SPARC-NEXT: rd %y, %o0
; SPARC-NEXT: addcc %l7, %l5, %l5
; SPARC-NEXT: addxcc %o0, 0, %l7
-; SPARC-NEXT: umul %g4, %i4, %o0
+; SPARC-NEXT: umul %g2, %i4, %o0
; SPARC-NEXT: rd %y, %o1
; SPARC-NEXT: addcc %o0, %l5, %l5
; SPARC-NEXT: addxcc %o1, 0, %o0
; SPARC-NEXT: addcc %l7, %o0, %l7
; SPARC-NEXT: addxcc %g0, 0, %o0
-; SPARC-NEXT: umul %g2, %i4, %o1
-; SPARC-NEXT: rd %y, %o2
-; SPARC-NEXT: addcc %o1, %l7, %l7
-; SPARC-NEXT: addxcc %o2, %o0, %o0
-; SPARC-NEXT: addcc %l4, %g3, %g3
-; SPARC-NEXT: addxcc %l5, %l0, %l0
-; SPARC-NEXT: addxcc %l7, 0, %l4
-; SPARC-NEXT: addxcc %o0, 0, %l5
-; SPARC-NEXT: addcc %l3, %l4, %l3
-; SPARC-NEXT: addxcc %l6, %l5, %l4
-; SPARC-NEXT: addxcc %g0, 0, %l5
-; SPARC-NEXT: umul %i1, %i5, %l6
-; SPARC-NEXT: rd %y, %l7
-; SPARC-NEXT: addxcc %g0, 0, %o0
-; SPARC-NEXT: umul %i0, %i5, %o1
+; SPARC-NEXT: umul %g3, %i4, %o1
; SPARC-NEXT: rd %y, %o2
; SPARC-NEXT: addcc %o1, %l7, %l7
-; SPARC-NEXT: addxcc %o2, 0, %o1
-; SPARC-NEXT: umul %i1, %i4, %o2
-; SPARC-NEXT: rd %y, %o3
-; SPARC-NEXT: addcc %o2, %l7, %l7
-; SPARC-NEXT: addxcc %o3, 0, %o2
-; SPARC-NEXT: addcc %o1, %o2, %o1
-; SPARC-NEXT: addxcc %g0, 0, %o2
-; SPARC-NEXT: umul %i0, %i4, %o3
-; SPARC-NEXT: rd %y, %o4
-; SPARC-NEXT: addcc %o3, %o1, %o1
-; SPARC-NEXT: addxcc %o4, %o2, %o2
-; SPARC-NEXT: addcc %l6, %l3, %l3
-; SPARC-NEXT: addxcc %l7, %l4, %l4
-; SPARC-NEXT: addxcc %o1, %l5, %l5
-; SPARC-NEXT: sra %i0, 31, %l6
-; SPARC-NEXT: smul %l6, %i4, %l7
-; SPARC-NEXT: umul %l6, %i5, %o1
+; SPARC-NEXT: sra %i4, 31, %o1
+; SPARC-NEXT: smul %o1, %g3, %g3
+; SPARC-NEXT: umul %o1, %g2, %g2
; SPARC-NEXT: rd %y, %o3
-; SPARC-NEXT: addxcc %o2, %o0, %i5
-; SPARC-NEXT: umul %l2, %l6, %l2
-; SPARC-NEXT: rd %y, %o0
-; SPARC-NEXT: add %o3, %l7, %l7
-; SPARC-NEXT: umul %l1, %l6, %l1
-; SPARC-NEXT: rd %y, %l6
-; SPARC-NEXT: add %l7, %o1, %l7
-; SPARC-NEXT: add %l6, %l2, %o2
-; SPARC-NEXT: add %o2, %l1, %o2
-; SPARC-NEXT: addcc %l1, %o1, %o1
-; SPARC-NEXT: addxcc %o2, %l7, %l7
-; SPARC-NEXT: addcc %l2, %l6, %o2
-; SPARC-NEXT: addxcc %o0, 0, %o3
-; SPARC-NEXT: addcc %l1, %o2, %o2
-; SPARC-NEXT: addxcc %l6, 0, %l6
-; SPARC-NEXT: addcc %o3, %l6, %l6
-; SPARC-NEXT: addxcc %g0, 0, %o3
-; SPARC-NEXT: addcc %l2, %l6, %l2
-; SPARC-NEXT: addxcc %o0, %o3, %l6
-; SPARC-NEXT: addcc %l2, %o1, %l2
-; SPARC-NEXT: sra %i4, 31, %i4
-; SPARC-NEXT: umul %i4, %g4, %g4
+; SPARC-NEXT: addxcc %o2, %o0, %o0
+; SPARC-NEXT: add %o3, %g3, %g3
+; SPARC-NEXT: add %g3, %g2, %g3
+; SPARC-NEXT: addcc %l7, %g2, %l7
+; SPARC-NEXT: addxcc %o0, %g3, %o0
+; SPARC-NEXT: addcc %l4, %l0, %g2
+; SPARC-NEXT: addxcc %l5, %l1, %g3
+; SPARC-NEXT: addxcc %l7, 0, %l0
+; SPARC-NEXT: addxcc %o0, 0, %l1
+; SPARC-NEXT: sra %l1, 31, %l4
+; SPARC-NEXT: addcc %l2, %l0, %l0
+; SPARC-NEXT: addxcc %l3, %l1, %l1
+; SPARC-NEXT: addxcc %l6, %l4, %l2
+; SPARC-NEXT: smul %i4, %g4, %l3
+; SPARC-NEXT: umul %i5, %g4, %g4
+; SPARC-NEXT: rd %y, %l5
+; SPARC-NEXT: addxcc %l6, %l4, %l4
+; SPARC-NEXT: add %l5, %g4, %l5
+; SPARC-NEXT: smul %o1, %i0, %l6
+; SPARC-NEXT: umul %o1, %i1, %l7
; SPARC-NEXT: rd %y, %o0
-; SPARC-NEXT: addxcc %l6, %l7, %l6
-; SPARC-NEXT: umul %i4, %g2, %g2
+; SPARC-NEXT: add %l5, %l3, %l3
+; SPARC-NEXT: add %o0, %l6, %l5
+; SPARC-NEXT: add %l5, %l7, %l5
+; SPARC-NEXT: addcc %l7, %g4, %g4
+; SPARC-NEXT: umul %i1, %i5, %l6
; SPARC-NEXT: rd %y, %l7
-; SPARC-NEXT: add %o0, %g4, %o1
-; SPARC-NEXT: smul %i0, %i4, %i0
+; SPARC-NEXT: addxcc %l5, %l3, %l3
+; SPARC-NEXT: umul %i0, %i5, %i5
+; SPARC-NEXT: rd %y, %l5
+; SPARC-NEXT: addcc %i5, %l7, %i5
+; SPARC-NEXT: addxcc %l5, 0, %l5
; SPARC-NEXT: umul %i1, %i4, %i1
+; SPARC-NEXT: rd %y, %l7
+; SPARC-NEXT: addcc %i1, %i5, %i1
+; SPARC-NEXT: addxcc %l7, 0, %i5
+; SPARC-NEXT: addcc %l5, %i5, %i5
+; SPARC-NEXT: addxcc %g0, 0, %l5
+; SPARC-NEXT: umul %i0, %i4, %i0
; SPARC-NEXT: rd %y, %i4
-; SPARC-NEXT: add %o1, %g2, %o1
-; SPARC-NEXT: add %i4, %i1, %i4
-; SPARC-NEXT: add %i4, %i0, %i0
-; SPARC-NEXT: addcc %i1, %g4, %i1
-; SPARC-NEXT: addxcc %i0, %o1, %i0
-; SPARC-NEXT: addcc %g4, %o0, %i4
-; SPARC-NEXT: addxcc %o0, 0, %o0
-; SPARC-NEXT: addcc %g2, %i4, %i4
-; SPARC-NEXT: addxcc %l7, 0, %o1
-; SPARC-NEXT: addcc %o0, %o1, %o0
-; SPARC-NEXT: addxcc %g0, 0, %o1
-; SPARC-NEXT: addcc %g2, %o0, %g2
-; SPARC-NEXT: addxcc %l7, %o1, %l7
-; SPARC-NEXT: addcc %g2, %i1, %i1
-; SPARC-NEXT: addxcc %l7, %i0, %i0
-; SPARC-NEXT: addcc %g4, %l1, %g2
-; SPARC-NEXT: addxcc %i4, %o2, %i4
-; SPARC-NEXT: addxcc %i1, %l2, %i1
-; SPARC-NEXT: addxcc %i0, %l6, %i0
-; SPARC-NEXT: addcc %l3, %g2, %g2
-; SPARC-NEXT: addxcc %l4, %i4, %i4
-; SPARC-NEXT: addxcc %l5, %i1, %i1
-; SPARC-NEXT: addxcc %i5, %i0, %i0
-; SPARC-NEXT: sra %l0, 31, %i5
-; SPARC-NEXT: xor %i0, %i5, %i0
-; SPARC-NEXT: xor %i4, %i5, %i4
+; SPARC-NEXT: addcc %i0, %i5, %i0
+; SPARC-NEXT: addxcc %i4, %l5, %i4
+; SPARC-NEXT: addcc %i0, %g4, %i0
+; SPARC-NEXT: addxcc %i4, %l3, %i4
+; SPARC-NEXT: addcc %l6, %l0, %i5
+; SPARC-NEXT: addxcc %i1, %l1, %i1
+; SPARC-NEXT: addxcc %i0, %l2, %i0
+; SPARC-NEXT: addxcc %i4, %l4, %i4
+; SPARC-NEXT: sra %g3, 31, %g4
+; SPARC-NEXT: xor %i4, %g4, %i4
+; SPARC-NEXT: xor %i1, %g4, %i1
+; SPARC-NEXT: or %i1, %i4, %i1
+; SPARC-NEXT: xor %i0, %g4, %i0
+; SPARC-NEXT: xor %i5, %g4, %i4
; SPARC-NEXT: or %i4, %i0, %i0
-; SPARC-NEXT: xor %i1, %i5, %i1
-; SPARC-NEXT: xor %g2, %i5, %i4
-; SPARC-NEXT: or %i4, %i1, %i1
-; SPARC-NEXT: or %i1, %i0, %i0
+; SPARC-NEXT: or %i0, %i1, %i0
; SPARC-NEXT: cmp %i0, 0
; SPARC-NEXT: bne .LBB0_2
; SPARC-NEXT: nop
@@ -163,98 +139,78 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
; SPARC-NEXT: .LBB0_2:
; SPARC-NEXT: mov 1, %i4
; SPARC-NEXT: .LBB0_3: ! %start
-; SPARC-NEXT: mov %l0, %i0
+; SPARC-NEXT: mov %g3, %i0
; SPARC-NEXT: ret
-; SPARC-NEXT: restore %g0, %g3, %o1
+; SPARC-NEXT: restore %g0, %g2, %o1
;
; SPARC64-LABEL: muloti_test:
; SPARC64: .register %g2, #scratch
; SPARC64-NEXT: .register %g3, #scratch
; SPARC64-NEXT: ! %bb.0: ! %start
; SPARC64-NEXT: save %sp, -176, %sp
-; SPARC64-NEXT: mov %i3, %i5
-; SPARC64-NEXT: mov %i2, %i3
-; SPARC64-NEXT: mov %i1, %i2
-; SPARC64-NEXT: mov %i0, %i4
-; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i1, %o1
+; SPARC64-NEXT: mov %i3, %i4
+; SPARC64-NEXT: mov %i1, %i5
+; SPARC64-NEXT: mov %i0, %l2
+; SPARC64-NEXT: srax %i0, 63, %i3
+; SPARC64-NEXT: mov %i3, %o0
+; SPARC64-NEXT: mov %i0, %o1
; SPARC64-NEXT: mov %g0, %o2
; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i5, %o3
-; SPARC64-NEXT: mov %o0, %i0
-; SPARC64-NEXT: mov %o1, %i1
+; SPARC64-NEXT: mov %i4, %o3
+; SPARC64-NEXT: mov %o0, %l0
+; SPARC64-NEXT: mov %o1, %l1
; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i4, %o1
+; SPARC64-NEXT: mov %i1, %o1
; SPARC64-NEXT: mov %g0, %o2
; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i5, %o3
-; SPARC64-NEXT: mov %g0, %g2
-; SPARC64-NEXT: add %o1, %i0, %i0
-; SPARC64-NEXT: cmp %i0, %o1
-; SPARC64-NEXT: movcs %xcc, 1, %g2
-; SPARC64-NEXT: srl %g2, 0, %g2
-; SPARC64-NEXT: add %o0, %g2, %l0
+; SPARC64-NEXT: mov %i4, %o3
+; SPARC64-NEXT: mov %o1, %i1
+; SPARC64-NEXT: mov %g0, %i0
+; SPARC64-NEXT: add %l1, %o0, %l3
+; SPARC64-NEXT: cmp %l3, %l1
+; SPARC64-NEXT: movcs %xcc, 1, %i0
+; SPARC64-NEXT: srl %i0, 0, %i0
+; SPARC64-NEXT: add %l0, %i0, %l0
+; SPARC64-NEXT: srax %l0, 63, %l1
+; SPARC64-NEXT: srax %i2, 63, %i4
; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i2, %o1
-; SPARC64-NEXT: mov %g0, %o2
+; SPARC64-NEXT: mov %i5, %o1
+; SPARC64-NEXT: mov %i4, %o2
; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i3, %o3
+; SPARC64-NEXT: mov %i2, %o3
+; SPARC64-NEXT: mov %g0, %i5
; SPARC64-NEXT: mov %g0, %g2
-; SPARC64-NEXT: mov %g0, %g3
-; SPARC64-NEXT: add %o1, %i0, %i0
+; SPARC64-NEXT: add %o1, %l3, %i0
; SPARC64-NEXT: cmp %i0, %o1
+; SPARC64-NEXT: movcs %xcc, 1, %i5
+; SPARC64-NEXT: srl %i5, 0, %i5
+; SPARC64-NEXT: add %o0, %i5, %i5
+; SPARC64-NEXT: srax %i5, 63, %g3
+; SPARC64-NEXT: add %l1, %g3, %g3
+; SPARC64-NEXT: add %l0, %i5, %i5
+; SPARC64-NEXT: cmp %i5, %l0
; SPARC64-NEXT: movcs %xcc, 1, %g2
; SPARC64-NEXT: srl %g2, 0, %g2
-; SPARC64-NEXT: add %o0, %g2, %g2
-; SPARC64-NEXT: add %l0, %g2, %l1
-; SPARC64-NEXT: cmp %l1, %l0
-; SPARC64-NEXT: movcs %xcc, 1, %g3
-; SPARC64-NEXT: srl %g3, 0, %l0
-; SPARC64-NEXT: mov %g0, %o0
-; SPARC64-NEXT: mov %i4, %o1
-; SPARC64-NEXT: mov %g0, %o2
-; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %i3, %o3
-; SPARC64-NEXT: mov %g0, %g2
-; SPARC64-NEXT: add %o0, %l0, %g3
-; SPARC64-NEXT: add %o1, %l1, %l1
-; SPARC64-NEXT: cmp %l1, %o1
-; SPARC64-NEXT: movcs %xcc, 1, %g2
-; SPARC64-NEXT: srl %g2, 0, %g2
-; SPARC64-NEXT: add %g3, %g2, %l2
-; SPARC64-NEXT: srax %i4, 63, %o2
+; SPARC64-NEXT: add %g3, %g2, %l0
; SPARC64-NEXT: mov %i3, %o0
-; SPARC64-NEXT: mov %i5, %o1
-; SPARC64-NEXT: call __multi3
-; SPARC64-NEXT: mov %o2, %o3
-; SPARC64-NEXT: mov %o0, %i5
-; SPARC64-NEXT: mov %o1, %l0
-; SPARC64-NEXT: srax %i3, 63, %o0
-; SPARC64-NEXT: mov %o0, %o1
+; SPARC64-NEXT: mov %l2, %o1
; SPARC64-NEXT: mov %i4, %o2
; SPARC64-NEXT: call __multi3
; SPARC64-NEXT: mov %i2, %o3
; SPARC64-NEXT: mov %g0, %i2
; SPARC64-NEXT: mov %g0, %i3
-; SPARC64-NEXT: mov %g0, %i4
-; SPARC64-NEXT: add %o0, %i5, %i5
-; SPARC64-NEXT: add %o1, %l0, %g2
-; SPARC64-NEXT: cmp %g2, %o1
+; SPARC64-NEXT: add %o0, %l0, %i4
+; SPARC64-NEXT: add %o1, %i5, %i5
+; SPARC64-NEXT: cmp %i5, %o1
; SPARC64-NEXT: movcs %xcc, 1, %i2
; SPARC64-NEXT: srl %i2, 0, %i2
-; SPARC64-NEXT: add %i5, %i2, %i2
-; SPARC64-NEXT: add %l2, %i2, %i2
-; SPARC64-NEXT: add %l1, %g2, %i5
-; SPARC64-NEXT: cmp %i5, %l1
-; SPARC64-NEXT: movcs %xcc, 1, %i3
-; SPARC64-NEXT: srl %i3, 0, %i3
-; SPARC64-NEXT: add %i2, %i3, %i2
-; SPARC64-NEXT: srax %i0, 63, %i3
-; SPARC64-NEXT: xor %i2, %i3, %i2
-; SPARC64-NEXT: xor %i5, %i3, %i3
-; SPARC64-NEXT: or %i3, %i2, %i2
-; SPARC64-NEXT: movrnz %i2, 1, %i4
-; SPARC64-NEXT: srl %i4, 0, %i2
+; SPARC64-NEXT: add %i4, %i2, %i2
+; SPARC64-NEXT: srax %i0, 63, %i4
+; SPARC64-NEXT: xor %i2, %i4, %i2
+; SPARC64-NEXT: xor %i5, %i4, %i4
+; SPARC64-NEXT: or %i4, %i2, %i2
+; SPARC64-NEXT: movrnz %i2, 1, %i3
+; SPARC64-NEXT: srl %i3, 0, %i2
; SPARC64-NEXT: ret
; SPARC64-NEXT: restore
start:
diff --git a/llvm/test/CodeGen/Thumb/smul_fix.ll b/llvm/test/CodeGen/Thumb/smul_fix.ll
index 52f241802b87e3..4a78f83d59ce7c 100644
--- a/llvm/test/CodeGen/Thumb/smul_fix.ll
+++ b/llvm/test/CodeGen/Thumb/smul_fix.ll
@@ -27,73 +27,56 @@ define i64 @func2(i64 %x, i64 %y) {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r5, r2
-; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: movs r6, #0
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r3, r6
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r0, r7
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r6
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: adcs r5, r6
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r7, r2
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: asrs r1, r1, #31
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r0, r2
; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: adcs r4, r5
-; ARM-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: mov r6, r1
; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r1, r4
; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp] @ 4-byte Spill
-; ARM-NEXT: asrs r2, r5, #31
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r3, r2
+; ARM-NEXT: adcs r6, r4
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: asrs r7, r2, #31
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: asrs r0, r7, #31
-; ARM-NEXT: mov r1, r0
-; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r5, r0, r1
+; ARM-NEXT: adcs r4, r6
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
-; ARM-NEXT: adds r0, r1, r0
; ARM-NEXT: lsls r0, r0, #30
-; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: lsrs r1, r2, #2
+; ARM-NEXT: lsrs r1, r5, #2
; ARM-NEXT: adds r1, r0, r1
-; ARM-NEXT: lsls r0, r2, #30
-; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: lsls r0, r5, #30
+; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; ARM-NEXT: lsrs r2, r2, #2
; ARM-NEXT: adds r0, r0, r2
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2)
ret i64 %tmp
@@ -161,60 +144,45 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; ARM-NEXT: push {r4, r5, r6, r7, lr}
; ARM-NEXT: .pad #20
; ARM-NEXT: sub sp, #20
-; ARM-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: str r1, [sp] @ 4-byte Spill
-; ARM-NEXT: movs r5, #0
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: asrs r1, r1, #31
; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: mov r1, r5
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r0, r2
; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: adcs r7, r5
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: adcs r4, r7
-; ARM-NEXT: ldr r7, [sp] @ 4-byte Reload
-; ARM-NEXT: mov r0, r7
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r5, r0, r4
-; ARM-NEXT: asrs r2, r7, #31
-; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r3, r2
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: asrs r0, r6, #31
-; ARM-NEXT: mov r1, r0
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r4
; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: asrs r7, r2, #31
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r4
; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: adds r1, r5, r0
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r5, r0, r1
+; ARM-NEXT: adcs r4, r6
+; ARM-NEXT: add r2, sp, #8
+; ARM-NEXT: ldm r2, {r0, r1, r2} @ 12-byte Folded Reload
+; ARM-NEXT: mov r3, r7
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r1, r0, r4
+; ARM-NEXT: mov r0, r5
; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 32)
@@ -226,78 +194,59 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: movs r5, #0
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: adds r7, r0, r1
-; ARM-NEXT: adcs r4, r5
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r4, r2
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: asrs r1, r1, #31
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: movs r7, #0
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r7
-; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r5
-; ARM-NEXT: adds r0, r4, r1
; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: mov r7, r5
-; ARM-NEXT: adcs r7, r5
-; ARM-NEXT: ldr r4, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: adcs r5, r7
-; ARM-NEXT: asrs r2, r4, #31
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r3, r2
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: mov r7, r1
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r6, r7
; ARM-NEXT: asrs r0, r6, #31
-; ARM-NEXT: mov r1, r0
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: ldr r3, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: asrs r4, r2, #31
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
+; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r5, r0, r2
+; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: asrs r7, r1, #31
+; ARM-NEXT: adds r6, r6, r1
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adcs r7, r0
+; ARM-NEXT: add r2, sp, #8
+; ARM-NEXT: ldm r2, {r0, r1, r2} @ 12-byte Folded Reload
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r0, r0, r6
; ARM-NEXT: adcs r1, r7
-; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; ARM-NEXT: adds r0, r2, r0
-; ARM-NEXT: adcs r1, r5
; ARM-NEXT: lsls r1, r1, #1
; ARM-NEXT: lsrs r2, r0, #31
; ARM-NEXT: adds r1, r1, r2
; ARM-NEXT: lsls r0, r0, #1
-; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: lsrs r2, r2, #31
+; ARM-NEXT: lsrs r2, r5, #31
; ARM-NEXT: adds r0, r0, r2
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 63)
ret i64 %tmp
diff --git a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
index 8bc39ea0370a60..24209b45e302df 100644
--- a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
@@ -45,75 +45,55 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #36
-; ARM-NEXT: sub sp, #36
-; ARM-NEXT: str r3, [sp, #28] @ 4-byte Spill
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: .pad #28
+; ARM-NEXT: sub sp, #28
+; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: mov r6, r2
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r7, r0
+; ARM-NEXT: asrs r1, r1, #31
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
; ARM-NEXT: movs r4, #0
-; ARM-NEXT: mov r5, r0
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
-; ARM-NEXT: adds r7, r0, r1
-; ARM-NEXT: adcs r6, r4
-; ARM-NEXT: mov r0, r5
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: ldr r5, [sp, #28] @ 4-byte Reload
-; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r7
; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r4
-; ARM-NEXT: adds r0, r6, r1
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r6, r4
-; ARM-NEXT: adcs r6, r4
-; ARM-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r4
+; ARM-NEXT: asrs r0, r5, #31
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: asrs r6, r2, #31
; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: str r4, [sp, #32] @ 4-byte Spill
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: adcs r5, r6
-; ARM-NEXT: mov r4, r7
-; ARM-NEXT: asrs r2, r7, #31
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: ldr r7, [sp, #28] @ 4-byte Reload
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r3, r2
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: asrs r0, r7, #31
-; ARM-NEXT: mov r1, r0
; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: mov r3, r4
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: asrs r7, r1, #31
+; ARM-NEXT: adds r5, r5, r1
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adcs r7, r0
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r6
-; ARM-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
-; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r3, r2, r0
-; ARM-NEXT: adcs r1, r5
+; ARM-NEXT: adds r3, r0, r5
+; ARM-NEXT: adcs r1, r7
; ARM-NEXT: rsbs r2, r1, #0
; ARM-NEXT: adcs r2, r1
; ARM-NEXT: movs r0, #1
@@ -121,66 +101,67 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; ARM-NEXT: mov r5, r0
; ARM-NEXT: bhi .LBB1_2
; ARM-NEXT: @ %bb.1:
-; ARM-NEXT: ldr r5, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: mov r5, r4
; ARM-NEXT: .LBB1_2:
; ARM-NEXT: ands r2, r5
; ARM-NEXT: cmp r1, #0
; ARM-NEXT: mov r5, r0
; ARM-NEXT: bgt .LBB1_4
; ARM-NEXT: @ %bb.3:
-; ARM-NEXT: ldr r5, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: mov r5, r4
; ARM-NEXT: .LBB1_4:
; ARM-NEXT: orrs r5, r2
; ARM-NEXT: lsls r2, r3, #30
-; ARM-NEXT: ldr r6, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: lsrs r4, r6, #2
-; ARM-NEXT: adds r2, r2, r4
-; ARM-NEXT: lsls r4, r6, #30
-; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: lsrs r6, r6, #2
-; ARM-NEXT: adds r7, r4, r6
-; ARM-NEXT: ldr r4, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: ldr r7, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: lsrs r6, r7, #2
+; ARM-NEXT: adds r2, r2, r6
+; ARM-NEXT: str r2, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: lsls r6, r7, #30
+; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: lsrs r7, r2, #2
+; ARM-NEXT: adds r7, r6, r7
; ARM-NEXT: mvns r6, r4
; ARM-NEXT: cmp r5, #0
; ARM-NEXT: beq .LBB1_6
; ARM-NEXT: @ %bb.5:
; ARM-NEXT: ldr r2, .LCPI1_0
+; ARM-NEXT: str r2, [sp, #24] @ 4-byte Spill
; ARM-NEXT: .LBB1_6:
; ARM-NEXT: mov r5, r6
; ARM-NEXT: bne .LBB1_8
; ARM-NEXT: @ %bb.7:
; ARM-NEXT: mov r5, r7
; ARM-NEXT: .LBB1_8:
-; ARM-NEXT: adds r4, r1, #1
-; ARM-NEXT: rsbs r7, r4, #0
-; ARM-NEXT: adcs r7, r4
-; ARM-NEXT: mvns r4, r0
-; ARM-NEXT: cmp r3, r4
+; ARM-NEXT: adds r2, r1, #1
+; ARM-NEXT: rsbs r7, r2, #0
+; ARM-NEXT: adcs r7, r2
+; ARM-NEXT: mvns r2, r0
+; ARM-NEXT: cmp r3, r2
; ARM-NEXT: mov r3, r0
; ARM-NEXT: blo .LBB1_10
; ARM-NEXT: @ %bb.9:
-; ARM-NEXT: ldr r3, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: .LBB1_10:
; ARM-NEXT: ands r7, r3
; ARM-NEXT: cmp r1, r6
; ARM-NEXT: mov r3, r0
; ARM-NEXT: blt .LBB1_12
; ARM-NEXT: @ %bb.11:
-; ARM-NEXT: ldr r3, [sp, #32] @ 4-byte Reload
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: .LBB1_12:
; ARM-NEXT: orrs r3, r7
; ARM-NEXT: lsls r1, r0, #31
; ARM-NEXT: cmp r3, #0
; ARM-NEXT: bne .LBB1_14
; ARM-NEXT: @ %bb.13:
-; ARM-NEXT: str r5, [sp, #32] @ 4-byte Spill
+; ARM-NEXT: mov r4, r5
; ARM-NEXT: .LBB1_14:
; ARM-NEXT: bne .LBB1_16
; ARM-NEXT: @ %bb.15:
-; ARM-NEXT: mov r1, r2
+; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; ARM-NEXT: .LBB1_16:
-; ARM-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
-; ARM-NEXT: add sp, #36
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: add sp, #28
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
; ARM-NEXT: .p2align 2
; ARM-NEXT: @ %bb.17:
@@ -272,76 +253,60 @@ define i64 @func5(i64 %x, i64 %y) {
; ARM-NEXT: .pad #28
; ARM-NEXT: sub sp, #28
; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: mov r4, r2
-; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
; ARM-NEXT: mov r5, r1
-; ARM-NEXT: movs r7, #0
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r3, r7
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r4, r0
+; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: asrs r1, r1, #31
+; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; ARM-NEXT: movs r6, #0
; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r2, r4
-; ARM-NEXT: mov r3, r7
+; ARM-NEXT: mov r3, r6
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: adcs r4, r7
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: mov r1, r7
+; ARM-NEXT: adcs r7, r6
+; ARM-NEXT: asrs r0, r7, #31
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r3, r7
+; ARM-NEXT: asrs r4, r2, #31
+; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r2
; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r7
-; ARM-NEXT: adds r0, r4, r1
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r6, r7
-; ARM-NEXT: adcs r6, r7
-; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: ldr r4, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r4
-; ARM-NEXT: mov r3, r7
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: asrs r6, r1, #31
+; ARM-NEXT: adds r0, r7, r1
; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: adcs r7, r6
-; ARM-NEXT: asrs r2, r5, #31
; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: mov r6, r4
-; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r3, r2
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: asrs r0, r6, #31
-; ARM-NEXT: mov r1, r0
-; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: adcs r6, r0
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r2, r2, r0
-; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: adds r2, r0, r2
+; ARM-NEXT: adcs r1, r6
; ARM-NEXT: asrs r0, r3, #31
; ARM-NEXT: eors r1, r0
; ARM-NEXT: eors r2, r0
; ARM-NEXT: orrs r2, r1
-; ARM-NEXT: eors r5, r6
+; ARM-NEXT: eors r5, r7
; ARM-NEXT: asrs r0, r5, #31
; ARM-NEXT: ldr r1, .LCPI4_0
; ARM-NEXT: eors r1, r0
@@ -410,130 +375,112 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: movs r6, #0
-; ARM-NEXT: mov r7, r0
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r4, r2
-; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r3, r6
-; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r5, r2
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r6, r0
+; ARM-NEXT: asrs r1, r1, #31
; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r2, r4
-; ARM-NEXT: mov r3, r6
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r4, r0, r1
-; ARM-NEXT: adcs r5, r6
-; ARM-NEXT: mov r0, r7
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r0, r2
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r6
-; ARM-NEXT: adds r0, r5, r1
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r4, r6
-; ARM-NEXT: adcs r4, r6
-; ARM-NEXT: ldr r5, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r6
-; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r4
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
; ARM-NEXT: adcs r7, r4
-; ARM-NEXT: asrs r2, r5, #31
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: ldr r5, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r3, r2
+; ARM-NEXT: asrs r0, r7, #31
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: asrs r5, r2, #31
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r3, r5
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: asrs r0, r5, #31
-; ARM-NEXT: mov r1, r0
+; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: asrs r6, r1, #31
+; ARM-NEXT: adds r7, r7, r1
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adcs r6, r0
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r3, r5
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
-; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r0, r2, r0
-; ARM-NEXT: adcs r1, r7
+; ARM-NEXT: adds r0, r0, r7
+; ARM-NEXT: adcs r1, r6
; ARM-NEXT: rsbs r5, r1, #0
; ARM-NEXT: adcs r5, r1
; ARM-NEXT: movs r2, #1
-; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
; ARM-NEXT: cmp r0, #0
; ARM-NEXT: mov r3, r2
; ARM-NEXT: bge .LBB6_2
; ARM-NEXT: @ %bb.1:
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: .LBB6_2:
-; ARM-NEXT: mov r4, r2
+; ARM-NEXT: mov r6, r2
; ARM-NEXT: bmi .LBB6_4
; ARM-NEXT: @ %bb.3:
-; ARM-NEXT: mov r4, r6
+; ARM-NEXT: mov r6, r4
; ARM-NEXT: .LBB6_4:
-; ARM-NEXT: ands r5, r4
+; ARM-NEXT: ands r5, r6
; ARM-NEXT: cmp r1, #0
; ARM-NEXT: mov r7, r2
; ARM-NEXT: bgt .LBB6_6
; ARM-NEXT: @ %bb.5:
-; ARM-NEXT: mov r7, r6
+; ARM-NEXT: mov r7, r4
; ARM-NEXT: .LBB6_6:
; ARM-NEXT: orrs r7, r5
-; ARM-NEXT: mvns r4, r6
+; ARM-NEXT: mvns r6, r4
; ARM-NEXT: cmp r7, #0
; ARM-NEXT: beq .LBB6_8
; ARM-NEXT: @ %bb.7:
; ARM-NEXT: ldr r0, .LCPI6_0
-; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
; ARM-NEXT: .LBB6_8:
-; ARM-NEXT: mov r5, r4
+; ARM-NEXT: mov r5, r6
; ARM-NEXT: bne .LBB6_10
; ARM-NEXT: @ %bb.9:
-; ARM-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: ldr r5, [sp] @ 4-byte Reload
; ARM-NEXT: .LBB6_10:
; ARM-NEXT: adds r0, r1, #1
; ARM-NEXT: rsbs r7, r0, #0
; ARM-NEXT: adcs r7, r0
; ARM-NEXT: ands r7, r3
-; ARM-NEXT: cmp r1, r4
+; ARM-NEXT: cmp r1, r6
; ARM-NEXT: mov r3, r2
; ARM-NEXT: blt .LBB6_12
; ARM-NEXT: @ %bb.11:
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: .LBB6_12:
; ARM-NEXT: orrs r3, r7
; ARM-NEXT: lsls r1, r2, #31
; ARM-NEXT: cmp r3, #0
; ARM-NEXT: bne .LBB6_14
; ARM-NEXT: @ %bb.13:
-; ARM-NEXT: mov r6, r5
+; ARM-NEXT: mov r4, r5
; ARM-NEXT: .LBB6_14:
; ARM-NEXT: bne .LBB6_16
; ARM-NEXT: @ %bb.15:
-; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; ARM-NEXT: .LBB6_16:
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: mov r0, r4
+; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
; ARM-NEXT: .p2align 2
; ARM-NEXT: @ %bb.17:
@@ -548,82 +495,64 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
; ARM-NEXT: mov r5, r2
-; ARM-NEXT: str r2, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: movs r7, #0
+; ARM-NEXT: mov r2, r1
+; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r3, r7
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r7
+; ARM-NEXT: asrs r1, r1, #31
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: movs r4, #0
+; ARM-NEXT: mov r0, r2
; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r7
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adds r4, r0, r1
-; ARM-NEXT: adcs r5, r7
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r7
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r7
-; ARM-NEXT: adds r0, r5, r1
; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: mov r4, r7
-; ARM-NEXT: adcs r4, r7
-; ARM-NEXT: ldr r5, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r7
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r5
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: adcs r6, r4
-; ARM-NEXT: asrs r2, r5, #31
-; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: ldr r4, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: asrs r0, r7, #31
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: asrs r5, r2, #31
+; ARM-NEXT: mov r0, r6
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r3, r2
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r0
-; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: asrs r0, r4, #31
-; ARM-NEXT: mov r1, r0
-; ARM-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: mov r3, r5
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r5
-; ARM-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; ARM-NEXT: adds r2, r2, r0
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: asrs r6, r1, #31
+; ARM-NEXT: adds r7, r7, r1
+; ARM-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ARM-NEXT: adcs r6, r0
+; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: mov r3, r5
+; ARM-NEXT: bl __aeabi_lmul
+; ARM-NEXT: adds r2, r0, r7
; ARM-NEXT: adcs r1, r6
; ARM-NEXT: lsls r0, r1, #1
; ARM-NEXT: lsrs r3, r2, #31
; ARM-NEXT: adds r0, r0, r3
; ARM-NEXT: lsls r2, r2, #1
-; ARM-NEXT: ldr r3, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
; ARM-NEXT: lsrs r3, r3, #31
; ARM-NEXT: adds r2, r2, r3
-; ARM-NEXT: mvns r3, r7
-; ARM-NEXT: ldr r4, .LCPI7_1
-; ARM-NEXT: cmp r1, r4
+; ARM-NEXT: mvns r3, r4
+; ARM-NEXT: ldr r5, .LCPI7_1
+; ARM-NEXT: cmp r1, r5
; ARM-NEXT: ble .LBB7_2
; ARM-NEXT: @ %bb.1:
; ARM-NEXT: ldr r0, .LCPI7_0
@@ -634,20 +563,20 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; ARM-NEXT: .LBB7_4:
; ARM-NEXT: movs r2, #1
; ARM-NEXT: lsls r2, r2, #31
-; ARM-NEXT: movs r4, #3
-; ARM-NEXT: lsls r4, r4, #30
-; ARM-NEXT: cmp r1, r4
+; ARM-NEXT: movs r5, #3
+; ARM-NEXT: lsls r5, r5, #30
+; ARM-NEXT: cmp r1, r5
; ARM-NEXT: blt .LBB7_6
; ARM-NEXT: @ %bb.5:
-; ARM-NEXT: mov r7, r3
+; ARM-NEXT: mov r4, r3
; ARM-NEXT: .LBB7_6:
; ARM-NEXT: blt .LBB7_8
; ARM-NEXT: @ %bb.7:
; ARM-NEXT: mov r2, r0
; ARM-NEXT: .LBB7_8:
-; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r0, r4
; ARM-NEXT: mov r1, r2
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
; ARM-NEXT: .p2align 2
; ARM-NEXT: @ %bb.9:
diff --git a/llvm/test/CodeGen/Thumb/umul_fix.ll b/llvm/test/CodeGen/Thumb/umul_fix.ll
index 7af5775c61d7bf..deab0955d977e9 100644
--- a/llvm/test/CodeGen/Thumb/umul_fix.ll
+++ b/llvm/test/CodeGen/Thumb/umul_fix.ll
@@ -27,71 +27,55 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: movs r6, #0
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r4, r2
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
; ARM-NEXT: mov r5, r0
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: movs r7, #0
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r0, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r0, r6
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r4, r1
-; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: adcs r4, r6
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r7
; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r6
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: adcs r7, r4
-; ARM-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r6
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r6
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r7, r0, r7
-; ARM-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: mov r1, r5
+; ARM-NEXT: mov r1, r7
+; ARM-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r0
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r4
+; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r1, r7
; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r6
+; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r5
-; ARM-NEXT: adds r0, r7, r0
; ARM-NEXT: lsls r0, r0, #30
-; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; ARM-NEXT: lsrs r1, r2, #2
; ARM-NEXT: adds r1, r0, r1
; ARM-NEXT: lsls r0, r2, #30
-; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
; ARM-NEXT: lsrs r2, r2, #2
; ARM-NEXT: adds r0, r0, r2
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 2)
ret i64 %tmp
@@ -154,63 +138,47 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #20
-; ARM-NEXT: sub sp, #20
-; ARM-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: str r1, [sp] @ 4-byte Spill
-; ARM-NEXT: movs r5, #0
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r6, r2
-; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r0, r7
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: .pad #12
+; ARM-NEXT: sub sp, #12
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r2
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r0
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: adcs r7, r5
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: ldr r4, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r4
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: adcs r6, r7
-; ARM-NEXT: ldr r7, [sp] @ 4-byte Reload
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r6
; ARM-NEXT: mov r0, r7
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r4
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r6, r0, r6
-; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r5, r4
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: adds r1, r6, r0
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: add sp, #20
+; ARM-NEXT: adds r1, r0, r5
+; ARM-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 32)
ret i64 %tmp
@@ -221,79 +189,56 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r4, r1
-; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: movs r5, #0
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: adds r4, r0, r1
-; ARM-NEXT: adcs r7, r5
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: .pad #12
+; ARM-NEXT: sub sp, #12
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r2
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r0
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r5
-; ARM-NEXT: adds r0, r7, r1
-; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: mov r4, r5
-; ARM-NEXT: adcs r4, r5
-; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: mov r0, r7
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r4, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: adcs r6, r4
-; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r4, r6
+; ARM-NEXT: mov r0, r7
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: mov r1, r7
+; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: adds r4, r4, r1
+; ARM-NEXT: mov r7, r6
+; ARM-NEXT: adcs r7, r6
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
-; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; ARM-NEXT: adds r0, r2, r0
-; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: adcs r1, r7
; ARM-NEXT: lsls r1, r1, #1
; ARM-NEXT: lsrs r2, r0, #31
; ARM-NEXT: adds r1, r1, r2
; ARM-NEXT: lsls r0, r0, #1
-; ARM-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
; ARM-NEXT: lsrs r2, r2, #31
; ARM-NEXT: adds r0, r0, r2
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 63)
ret i64 %tmp
@@ -304,71 +249,46 @@ define i64 @func9(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #20
-; ARM-NEXT: sub sp, #20
-; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
-; ARM-NEXT: mov r4, r1
+; ARM-NEXT: .pad #12
+; ARM-NEXT: sub sp, #12
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r4, r2
+; ARM-NEXT: mov r5, r1
; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: movs r5, #0
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r7, r0
+; ARM-NEXT: movs r6, #0
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: str r1, [sp] @ 4-byte Spill
-; ARM-NEXT: mov r0, r4
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r0, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
+; ARM-NEXT: mov r4, r1
; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
-; ARM-NEXT: adds r4, r0, r1
-; ARM-NEXT: adcs r7, r5
-; ARM-NEXT: mov r0, r6
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: adcs r1, r5
-; ARM-NEXT: adds r0, r7, r1
-; ARM-NEXT: str r0, [sp] @ 4-byte Spill
-; ARM-NEXT: mov r4, r5
-; ARM-NEXT: adcs r4, r5
-; ARM-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r5, r0, r1
+; ARM-NEXT: adcs r4, r6
; ARM-NEXT: mov r0, r7
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r6
-; ARM-NEXT: mov r3, r5
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp] @ 4-byte Spill
-; ARM-NEXT: adcs r6, r4
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r4, r0
-; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r2, r5
-; ARM-NEXT: mov r3, r5
+; ARM-NEXT: adds r0, r0, r5
+; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: adds r4, r4, r1
+; ARM-NEXT: mov r5, r6
+; ARM-NEXT: adcs r5, r6
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r1, r6
+; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r4
-; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
-; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
-; ARM-NEXT: adds r0, r2, r0
-; ARM-NEXT: adcs r1, r6
-; ARM-NEXT: add sp, #20
+; ARM-NEXT: adcs r1, r5
+; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 64)
ret i64 %tmp
diff --git a/llvm/test/CodeGen/Thumb/umul_fix_sat.ll b/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
index a43b22102c24bf..f24fe933e5ad4a 100644
--- a/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/umul_fix_sat.ll
@@ -34,99 +34,72 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: .pad #20
+; ARM-NEXT: sub sp, #20
+; ARM-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: mov r6, r2
+; ARM-NEXT: mov r7, r1
+; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r0
; ARM-NEXT: movs r4, #0
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #12] @ 4-byte Spill
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: str r1, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: adcs r5, r4
-; ARM-NEXT: mov r0, r6
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r7
+; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r2
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r4
-; ARM-NEXT: adds r0, r5, r1
-; ARM-NEXT: str r0, [sp] @ 4-byte Spill
-; ARM-NEXT: mov r6, r4
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: adds r7, r0, r1
; ARM-NEXT: adcs r6, r4
-; ARM-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
; ARM-NEXT: mov r0, r5
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp] @ 4-byte Spill
-; ARM-NEXT: adcs r7, r6
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r4
+; ARM-NEXT: ldr r5, [sp, #12] @ 4-byte Reload
+; ARM-NEXT: mov r2, r5
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; ARM-NEXT: adds r0, r0, r7
+; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: adds r6, r6, r1
+; ARM-NEXT: mov r7, r4
+; ARM-NEXT: adcs r7, r4
; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: mov r1, r5
-; ARM-NEXT: mov r2, r4
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r5
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r6
-; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
-; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
-; ARM-NEXT: adds r0, r2, r0
; ARM-NEXT: adcs r1, r7
-; ARM-NEXT: lsrs r5, r0, #2
-; ARM-NEXT: orrs r5, r1
+; ARM-NEXT: lsrs r6, r0, #2
+; ARM-NEXT: orrs r6, r1
; ARM-NEXT: lsls r0, r0, #30
-; ARM-NEXT: ldr r3, [sp, #16] @ 4-byte Reload
+; ARM-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
; ARM-NEXT: lsrs r1, r3, #2
; ARM-NEXT: adds r2, r0, r1
; ARM-NEXT: lsls r0, r3, #30
-; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; ARM-NEXT: lsrs r1, r1, #2
; ARM-NEXT: adds r3, r0, r1
; ARM-NEXT: mvns r1, r4
-; ARM-NEXT: cmp r5, #0
+; ARM-NEXT: cmp r6, #0
; ARM-NEXT: mov r0, r1
; ARM-NEXT: beq .LBB1_3
; ARM-NEXT: @ %bb.1:
; ARM-NEXT: beq .LBB1_4
; ARM-NEXT: .LBB1_2:
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
; ARM-NEXT: .LBB1_3:
; ARM-NEXT: mov r0, r3
; ARM-NEXT: bne .LBB1_2
; ARM-NEXT: .LBB1_4:
; ARM-NEXT: mov r1, r2
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 2)
ret i64 %tmp
@@ -338,72 +311,49 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; ARM-NEXT: .pad #12
+; ARM-NEXT: sub sp, #12
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r2
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r0
; ARM-NEXT: movs r4, #0
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
; ARM-NEXT: mov r1, r4
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r5, r0, r1
-; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
; ARM-NEXT: mov r0, r6
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r2, r5
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r5
-; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r4
-; ARM-NEXT: adds r0, r7, r1
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: mov r5, r4
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
; ARM-NEXT: adcs r5, r4
-; ARM-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r4
+; ARM-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #8] @ 4-byte Spill
-; ARM-NEXT: adcs r6, r5
-; ARM-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r4
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r0
-; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r2, r4
+; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: adds r5, r5, r1
+; ARM-NEXT: mov r7, r4
+; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r2, r1
-; ARM-NEXT: adds r0, r0, r5
-; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: adcs r2, r1
-; ARM-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; ARM-NEXT: adds r3, r1, r0
-; ARM-NEXT: adcs r2, r6
+; ARM-NEXT: adds r3, r0, r5
+; ARM-NEXT: adcs r2, r7
; ARM-NEXT: mvns r1, r4
; ARM-NEXT: cmp r2, #0
; ARM-NEXT: mov r0, r1
@@ -411,14 +361,14 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; ARM-NEXT: @ %bb.1:
; ARM-NEXT: beq .LBB7_4
; ARM-NEXT: .LBB7_2:
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
; ARM-NEXT: .LBB7_3:
-; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: ldr r0, [sp] @ 4-byte Reload
; ARM-NEXT: bne .LBB7_2
; ARM-NEXT: .LBB7_4:
; ARM-NEXT: mov r1, r3
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 32)
ret i64 %tmp
@@ -429,76 +379,53 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
-; ARM-NEXT: .pad #28
-; ARM-NEXT: sub sp, #28
-; ARM-NEXT: str r3, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: mov r5, r1
-; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: .pad #12
+; ARM-NEXT: sub sp, #12
+; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; ARM-NEXT: mov r5, r2
+; ARM-NEXT: mov r6, r1
+; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; ARM-NEXT: mov r7, r0
; ARM-NEXT: movs r4, #0
-; ARM-NEXT: mov r6, r0
-; ARM-NEXT: str r0, [sp, #12] @ 4-byte Spill
; ARM-NEXT: mov r1, r4
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: mov r0, r5
-; ARM-NEXT: mov r1, r4
-; ARM-NEXT: mov r2, r7
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r7, r1
-; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
-; ARM-NEXT: adds r5, r0, r1
-; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: str r1, [sp] @ 4-byte Spill
; ARM-NEXT: mov r0, r6
; ARM-NEXT: mov r1, r4
-; ARM-NEXT: ldr r6, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r6
+; ARM-NEXT: mov r2, r5
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r5
-; ARM-NEXT: str r0, [sp, #20] @ 4-byte Spill
-; ARM-NEXT: adcs r1, r4
-; ARM-NEXT: adds r0, r7, r1
-; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: mov r5, r4
+; ARM-NEXT: mov r5, r1
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r1
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
; ARM-NEXT: adcs r5, r4
-; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r4
+; ARM-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r6, r1
-; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; ARM-NEXT: adds r0, r0, r1
-; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; ARM-NEXT: adcs r6, r5
-; ARM-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: mov r2, r4
-; ARM-NEXT: mov r3, r4
-; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: mov r5, r0
-; ARM-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; ARM-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
-; ARM-NEXT: mov r1, r7
-; ARM-NEXT: mov r2, r4
+; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
+; ARM-NEXT: adds r0, r0, r2
+; ARM-NEXT: str r0, [sp] @ 4-byte Spill
+; ARM-NEXT: adcs r1, r4
+; ARM-NEXT: adds r5, r5, r1
+; ARM-NEXT: mov r7, r4
+; ARM-NEXT: adcs r7, r4
+; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; ARM-NEXT: mov r1, r4
+; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r4
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r5
-; ARM-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
-; ARM-NEXT: adcs r1, r2
-; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; ARM-NEXT: adds r0, r2, r0
-; ARM-NEXT: adcs r1, r6
+; ARM-NEXT: adcs r1, r7
; ARM-NEXT: lsls r1, r1, #1
; ARM-NEXT: lsrs r5, r0, #31
; ARM-NEXT: adds r2, r1, r5
; ARM-NEXT: lsls r0, r0, #1
-; ARM-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
; ARM-NEXT: lsrs r1, r1, #31
; ARM-NEXT: adds r3, r0, r1
; ARM-NEXT: mvns r1, r4
@@ -508,14 +435,14 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; ARM-NEXT: @ %bb.1:
; ARM-NEXT: beq .LBB8_4
; ARM-NEXT: .LBB8_2:
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
; ARM-NEXT: .LBB8_3:
; ARM-NEXT: mov r0, r3
; ARM-NEXT: bne .LBB8_2
; ARM-NEXT: .LBB8_4:
; ARM-NEXT: mov r1, r2
-; ARM-NEXT: add sp, #28
+; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.sat.i64(i64 %x, i64 %y, i32 63)
ret i64 %tmp
diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index 8b75c6fb68c78c..e101c702e64097 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -13,62 +13,49 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nou
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %r14, -16
-; CHECK-NEXT: movq %rdx, %r11
-; CHECK-NEXT: movq %rdi, %r10
-; CHECK-NEXT: movq %rsi, %rdx
-; CHECK-NEXT: sarq $63, %rdx
-; CHECK-NEXT: movq %rcx, %rdi
-; CHECK-NEXT: imulq %rdx, %rdi
-; CHECK-NEXT: movq %r11, %rax
+; CHECK-NEXT: movq %rdx, %r9
+; CHECK-NEXT: movq %rsi, %r8
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: sarq $63, %rbx
+; CHECK-NEXT: imulq %rdx, %rbx
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %rdx
+; CHECK-NEXT: movq %rdx, %r10
+; CHECK-NEXT: movq %rax, %rsi
+; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: mulq %r9
; CHECK-NEXT: movq %rdx, %r9
-; CHECK-NEXT: movq %rax, %rbx
-; CHECK-NEXT: addq %rax, %r9
-; CHECK-NEXT: addq %rdi, %r9
-; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: sarq $63, %rax
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: imulq %rsi, %r14
-; CHECK-NEXT: mulq %r10
-; CHECK-NEXT: movq %rax, %r8
-; CHECK-NEXT: movq %rdx, %rdi
-; CHECK-NEXT: addq %r14, %rdi
-; CHECK-NEXT: addq %rax, %rdi
-; CHECK-NEXT: addq %rbx, %r8
-; CHECK-NEXT: adcq %r9, %rdi
-; CHECK-NEXT: movq %r10, %rax
-; CHECK-NEXT: mulq %r11
-; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: movq %rax, %r9
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: mulq %r11
-; CHECK-NEXT: movq %rdx, %r11
-; CHECK-NEXT: movq %rax, %r14
-; CHECK-NEXT: addq %rbx, %r14
-; CHECK-NEXT: adcq $0, %r11
-; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: movq %rax, %r11
+; CHECK-NEXT: addq %r10, %r11
+; CHECK-NEXT: adcq %rbx, %r9
+; CHECK-NEXT: movq %r9, %rbx
+; CHECK-NEXT: sarq $63, %rbx
+; CHECK-NEXT: movq %rcx, %r14
+; CHECK-NEXT: sarq $63, %r14
+; CHECK-NEXT: imulq %rdi, %r14
+; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: movq %rax, %r10
-; CHECK-NEXT: addq %r14, %r10
-; CHECK-NEXT: adcq %r11, %rbx
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: movzbl %al, %r11d
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: addq %rbx, %rax
+; CHECK-NEXT: movq %rdx, %r10
+; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: addq %r11, %rdi
+; CHECK-NEXT: adcq %r14, %r10
+; CHECK-NEXT: movq %r10, %r11
+; CHECK-NEXT: sarq $63, %r11
+; CHECK-NEXT: addq %r9, %r10
+; CHECK-NEXT: adcq %rbx, %r11
+; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: imulq %rcx
+; CHECK-NEXT: addq %r10, %rax
; CHECK-NEXT: adcq %r11, %rdx
-; CHECK-NEXT: addq %r8, %rax
-; CHECK-NEXT: adcq %rdi, %rdx
-; CHECK-NEXT: movq %r10, %rcx
+; CHECK-NEXT: movq %rdi, %rcx
; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: orq %rdx, %rcx
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %nooverflow
-; CHECK-NEXT: movq %r9, %rax
-; CHECK-NEXT: movq %r10, %rdx
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: movq %rdi, %rdx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index 42904ee0db90c1..df167338268c44 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,609 +191,490 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $188, %esp
+; X86-NEXT: subl $108, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: negl %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: andl $1, %ebp
-; X86-NEXT: negl %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: negl %eax
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl %edx, %ebx
; X86-NEXT: setb %al
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: movzbl %al, %ecx
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: addl %esi, %edx
; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: adcl %edi, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl %ecx, %eax
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebp
+; X86-NEXT: adcl %ecx, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: addl %edi, %ecx
; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb (%esp) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edi
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl (%esp), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: adcl %eax, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: sarl $31, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: setb %al
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: adcl %ebp, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movzbl %bl, %ebx
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl %edx, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: adcl %esi, %eax
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %ebp, %edi
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: adcl %ecx, %eax
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl %ebx, %edx
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %ecx
; X86-NEXT: setb %al
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: addl %ebp, %ecx
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: movzbl %al, %esi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: movl %ecx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: adcl %ebp, %esi
-; X86-NEXT: setb %al
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl %ebp, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: adcl %edx, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: addl %edi, %ebx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: setb %al
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: movzbl %al, %ebp
-; X86-NEXT: adcl %esi, %ebp
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: adcl %ebp, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: setb %bl
-; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edx, %esi
; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movzbl %bl, %ebx
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT: adcl %edx, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ebx, %edi
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: adcl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: adcl %ebp, %edi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: movzbl %al, %eax
; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: adcl %edx, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: movl %ebp, %ecx
+; X86-NEXT: adcl %ebp, %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: adcl %edx, %edx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl %edi, %ecx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, %edi
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %ebp
-; X86-NEXT: xorl %edx, %eax
-; X86-NEXT: orl %ebp, %eax
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %edx, %ecx
-; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: xorl %edx, %esi
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: xorl %edx, %edi
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: sarl $31, %ebp
+; X86-NEXT: xorl %ebp, %ebx
+; X86-NEXT: xorl %ebp, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: xorl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl %ebp, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: xorl %ebp, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl %ebp, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: xorl %ebp, %edx
+; X86-NEXT: xorl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %ebp
+; X86-NEXT: orl %eax, %ebp
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andl $1, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: negl %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: xorl %eax, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl %eax, %ebx
-; X86-NEXT: xorl %edi, %eax
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl %edx, 4(%eax)
@@ -805,7 +686,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X86-NEXT: movl %edx, 12(%eax)
; X86-NEXT: movb %cl, 16(%eax)
; X86-NEXT: setne 32(%eax)
-; X86-NEXT: addl $188, %esp
+; X86-NEXT: addl $108, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -820,178 +701,137 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %r9, %r15
-; X64-NEXT: movq %rcx, %r9
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rsi, %r12
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rsi, %rbx
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; X64-NEXT: andl $1, %r11d
-; X64-NEXT: negq %r11
-; X64-NEXT: andl $1, %r9d
-; X64-NEXT: negq %r9
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT: andl $1, %r13d
+; X64-NEXT: negq %r13
+; X64-NEXT: andl $1, %r14d
+; X64-NEXT: negq %r14
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r11
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %rdx, %rdi
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %rdx, %r12
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: movq %r14, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: addq %rax, %r12
+; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: setb %cl
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: addq %rax, %r11
; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: setb %sil
-; X64-NEXT: movzbl %sil, %r14d
-; X64-NEXT: addq %rax, %rcx
-; X64-NEXT: adcq %rdx, %r14
-; X64-NEXT: movq %r12, %rax
+; X64-NEXT: addq %rdi, %r11
+; X64-NEXT: adcq %r12, %rcx
+; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rdx, %r10
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rbx, %rbp
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: addq %r10, %rbp
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r9
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: addq %rbp, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r13, %rsi
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: addq %rsi, %rax
-; X64-NEXT: movzbl %r8b, %edx
-; X64-NEXT: adcq %rdx, %rbx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: adcq %rdi, %rbx
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: addq %r13, %r15
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: addq %r8, %r15
-; X64-NEXT: adcq %r13, %rbp
+; X64-NEXT: adcq %r8, %rsi
; X64-NEXT: setb %al
-; X64-NEXT: addq %rdi, %rbp
-; X64-NEXT: movzbl %al, %r12d
-; X64-NEXT: adcq %rdx, %r12
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: movq %r8, %r10
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %r15
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq %r14, %r12
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: addq %rax, %r8
-; X64-NEXT: adcq %rdx, %rbx
-; X64-NEXT: setb %r14b
-; X64-NEXT: addq %rax, %rbx
-; X64-NEXT: movzbl %r14b, %r14d
-; X64-NEXT: adcq %rdx, %r14
-; X64-NEXT: addq %rax, %rbp
+; X64-NEXT: movzbl %al, %ebp
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rsi, %r10
+; X64-NEXT: adcq %rbp, %r8
+; X64-NEXT: addq %rdi, %r10
; X64-NEXT: adcq %r12, %r8
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %rbx
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq %rsi, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: addq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: adcq $0, %r11
; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT: addq %r12, %rdi
-; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: setb %al
-; X64-NEXT: addq %rsi, %rcx
-; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rsi, %rbx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: adcq %rsi, %r15
+; X64-NEXT: setb %sil
+; X64-NEXT: movzbl %sil, %esi
+; X64-NEXT: addq %rax, %r15
; X64-NEXT: adcq %rdx, %rsi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: imulq %r11
-; X64-NEXT: movq %r12, %r11
-; X64-NEXT: addq %rax, %r11
-; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: addq %r9, %r15
+; X64-NEXT: adcq %rbx, %rsi
+; X64-NEXT: addq %r9, %r10
+; X64-NEXT: adcq %r8, %rbx
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: addq %r11, %r15
+; X64-NEXT: adcq %rcx, %rsi
+; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: adcq %rax, %r9
+; X64-NEXT: adcq %rax, %rdi
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rax, %r8
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: addq %rax, %rcx
+; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: setb %al
+; X64-NEXT: addq %r8, %r11
+; X64-NEXT: movzbl %al, %r12d
; X64-NEXT: adcq %rdx, %r12
-; X64-NEXT: addq %rcx, %r11
-; X64-NEXT: adcq %rsi, %r12
-; X64-NEXT: movq %r10, %r9
-; X64-NEXT: addq %r13, %r9
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: adcq %rsi, %r13
-; X64-NEXT: setb %r10b
-; X64-NEXT: addq %rcx, %r13
-; X64-NEXT: movzbl %r10b, %ecx
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: addq %rsi, %rax
-; X64-NEXT: adcq %r9, %rdx
-; X64-NEXT: addq %r13, %rax
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq %rdi, %r9
-; X64-NEXT: adcq %r11, %rax
+; X64-NEXT: movq %r13, %rax
+; X64-NEXT: imulq %r14
+; X64-NEXT: addq %rax, %rax
+; X64-NEXT: adcq %rdx, %rdx
+; X64-NEXT: addq %r11, %rax
; X64-NEXT: adcq %r12, %rdx
-; X64-NEXT: addq %rbp, %rsi
-; X64-NEXT: adcq %r8, %r9
-; X64-NEXT: adcq %rbx, %rax
-; X64-NEXT: adcq %r14, %rdx
-; X64-NEXT: movq %r15, %rcx
-; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: xorq %rcx, %rdx
-; X64-NEXT: xorq %rcx, %r9
-; X64-NEXT: orq %rdx, %r9
-; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: xorq %rsi, %rcx
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: orq %r9, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: movq %rsi, %rdx
-; X64-NEXT: negq %rdx
-; X64-NEXT: xorq %rdx, %r15
-; X64-NEXT: xorq %rax, %rdx
-; X64-NEXT: orq %r15, %rdx
-; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: addq %r8, %r15
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: adcq %r9, %rax
+; X64-NEXT: adcq %rdi, %rdx
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: sarq $63, %rsi
+; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: xorq %rsi, %r15
+; X64-NEXT: orq %rax, %r15
+; X64-NEXT: xorq %rsi, %rdx
+; X64-NEXT: xorq %rcx, %rsi
+; X64-NEXT: orq %rdx, %rsi
+; X64-NEXT: orq %r15, %rsi
+; X64-NEXT: movl %r10d, %edx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: negq %rcx
+; X64-NEXT: xorq %rcx, %rbx
+; X64-NEXT: xorq %r10, %rcx
+; X64-NEXT: orq %rbx, %rcx
+; X64-NEXT: orq %rsi, %rcx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, 8(%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, (%rax)
-; X64-NEXT: movb %sil, 16(%rax)
+; X64-NEXT: movb %dl, 16(%rax)
; X64-NEXT: setne 32(%rax)
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 85c966c447fad6..e68b6e328b723b 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -369,76 +369,63 @@ define i64 @func5(i64 %x, i64 %y) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: .cfi_def_cfa_offset 32
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 28
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: imull %edx, %ebx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: imull %ecx, %ebp
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %ebx
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: imull %edi, %esi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %eax, %ebx
+; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: adcl %esi, %ebp
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull %eax, %esi
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: addl %ebp, %ebx
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: sarl $31, %ebx
-; X86-NEXT: xorl %ebx, %edx
-; X86-NEXT: xorl %eax, %ebx
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %ebp
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: xorl %ebp, %ecx
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF
-; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: xorl $2147483647, %ebx # imm = 0x7FFFFFFF
+; X86-NEXT: orl %edx, %esi
; X86-NEXT: notl %ecx
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: cmovel %esi, %edi
+; X86-NEXT: cmovel %edi, %ebx
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: addl $8, %esp
; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 15f302355784ce..13596e1b187681 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -14,60 +14,47 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X64-NEXT: .cfi_offset %rbx, -32
; X64-NEXT: .cfi_offset %r14, -24
; X64-NEXT: .cfi_offset %r15, -16
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rdi, %r11
-; X64-NEXT: movq %rsi, %rdx
-; X64-NEXT: sarq $63, %rdx
-; X64-NEXT: movq %rcx, %r9
-; X64-NEXT: imulq %rdx, %r9
-; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rsi, %r9
+; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: imulq %rdx, %r14
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rdx
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rax, %rdi
-; X64-NEXT: addq %r9, %rdi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: sarq $63, %rax
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: imulq %rsi, %r15
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: addq %r15, %r9
-; X64-NEXT: addq %rax, %r9
-; X64-NEXT: addq %r14, %r10
-; X64-NEXT: adcq %rdi, %r9
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r14, %r15
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r15, %r11
-; X64-NEXT: adcq %rbx, %r14
-; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %ebx
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: mulq %r10
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: adcq %r14, %r10
+; X64-NEXT: movq %r10, %r14
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: sarq $63, %r15
+; X64-NEXT: imulq %rdi, %r15
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %rbx, %rdi
+; X64-NEXT: adcq %r15, %r11
+; X64-NEXT: movq %r11, %rbx
+; X64-NEXT: sarq $63, %rbx
+; X64-NEXT: addq %r10, %r11
+; X64-NEXT: adcq %r14, %rbx
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: imulq %rcx
+; X64-NEXT: addq %r11, %rax
; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: addq %r10, %rax
-; X64-NEXT: adcq %r9, %rdx
-; X64-NEXT: movq %r11, 8(%r8)
-; X64-NEXT: sarq $63, %r11
-; X64-NEXT: xorq %r11, %rdx
-; X64-NEXT: xorq %rax, %r11
-; X64-NEXT: orq %rdx, %r11
+; X64-NEXT: movq %rdi, 8(%r8)
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: xorq %rdi, %rdx
+; X64-NEXT: xorq %rax, %rdi
+; X64-NEXT: orq %rdx, %rdi
; X64-NEXT: setne %al
-; X64-NEXT: movq %rdi, (%r8)
+; X64-NEXT: movq %rsi, (%r8)
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r14
; X64-NEXT: popq %r15
@@ -83,221 +70,207 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $60, %esp
-; X86-NEXT: .cfi_def_cfa_offset 80
+; X86-NEXT: subl $44, %esp
+; X86-NEXT: .cfi_def_cfa_offset 64
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %ebx, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl %esi, %ebx
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: addl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl %bl, %edi
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ecx, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: imull %esi, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: imull %ecx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl %eax, %esi
; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl %bl, %ecx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: adcl %edx, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: imull %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %ebp
-; X86-NEXT: orl %eax, %ebp
-; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: xorl %ecx, %edx
; X86-NEXT: xorl %ebx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -305,7 +278,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 8(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $60, %esp
+; X86-NEXT: addl $44, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -339,218 +312,192 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X64-NEXT: .cfi_offset %r14, -32
; X64-NEXT: .cfi_offset %r15, -24
; X64-NEXT: .cfi_offset %rbp, -16
-; X64-NEXT: movq %rcx, %r13
-; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %r8, %r12
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: movq %rdi, %r11
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r9, %rcx
+; X64-NEXT: addq %rsi, %r14
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r14, %rbx
-; X64-NEXT: adcq %rsi, %r12
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %r14, %r13
+; X64-NEXT: adcq %rcx, %rsi
; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %r9d
-; X64-NEXT: movq %r13, %rax
+; X64-NEXT: movzbl %al, %ecx
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %rsi, %r14
+; X64-NEXT: adcq %rcx, %r8
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: sarq $63, %rcx
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: imulq %rcx, %rsi
+; X64-NEXT: movq %r12, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rcx, %r14
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r12, %rsi
-; X64-NEXT: adcq %r9, %rcx
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: addq %rax, %r15
+; X64-NEXT: addq %rsi, %r15
+; X64-NEXT: addq %rax, %r14
+; X64-NEXT: adcq %r8, %r15
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r9, %r13
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r15
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: addq %r13, %rax
+; X64-NEXT: mulq %r12
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %r12
+; X64-NEXT: addq %rsi, %r12
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: addq %r12, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r12, %r9
+; X64-NEXT: adcq %rdi, %rbx
; X64-NEXT: setb %dil
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r9, %rbp
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: addq %rbx, %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %r13
+; X64-NEXT: adcq %rax, %rbp
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
+; X64-NEXT: adcq %r13, %rbp
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: movq %r15, %r12
+; X64-NEXT: sarq $63, %r12
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rdi, %r9
+; X64-NEXT: adcq $0, %r13
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT: addq %r11, %rbp
-; X64-NEXT: adcq %rbx, %r13
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r11, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rdi, %r11
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: addq %r9, %rax
; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: adcq %r13, %rdi
+; X64-NEXT: setb %r8b
; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, %r14
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: mulq %rdi
+; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r13
+; X64-NEXT: addq %rdi, %r13
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: adcq %rax, %r9
+; X64-NEXT: movq %r11, %rdi
+; X64-NEXT: movq %r11, %r8
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: imulq %rdi, %r10
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r9, %r10
+; X64-NEXT: addq %r10, %r11
+; X64-NEXT: addq %rax, %r11
+; X64-NEXT: addq %rax, %r13
+; X64-NEXT: adcq %r9, %r11
+; X64-NEXT: addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
+; X64-NEXT: adcq %rbp, %rbx
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq $0, %r13
; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: movq %r11, %rbp
+; X64-NEXT: sarq $63, %rbp
+; X64-NEXT: addq %r14, %r13
+; X64-NEXT: adcq %r15, %r11
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: adcq %rbp, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT: adcq %r12, %rbp
+; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: imulq %rcx, %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r10, %r15
-; X64-NEXT: adcq %r11, %rdi
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rdi, %r9
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r11
-; X64-NEXT: addq %rbp, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r13, %r15
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: addq %rsi, %r9
-; X64-NEXT: adcq %rcx, %r11
-; X64-NEXT: setb %bl
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %rax, %rsi
+; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
+; X64-NEXT: imulq %r12, %rcx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r12
; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: adcq %rsi, %r10
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r10, %r15
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r13
+; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: addq %rax, %r10
; X64-NEXT: addq %r9, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r11, %rdi
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq %rbp, %rdi
-; X64-NEXT: sarq $63, %rdi
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r10, %r11
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: addq %rsi, %r11
-; X64-NEXT: movq %rsi, %rbx
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r10, %r9
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: imulq %r12, %r8
+; X64-NEXT: adcq %rsi, %r10
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq {{[0-9]+}}(%rsp)
-; X64-NEXT: addq %r8, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: adcq %r11, %rdx
-; X64-NEXT: addq %r14, %r9
-; X64-NEXT: movzbl %sil, %esi
-; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: addq %rax, %r9
-; X64-NEXT: adcq %rdx, %rsi
-; X64-NEXT: sarq $63, %r12
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rdx, %r14
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rax, %rsi
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
+; X64-NEXT: mulq %r15
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %rdi, %r15
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rax, %r14
-; X64-NEXT: adcq %rdx, %rdi
-; X64-NEXT: setb %bl
-; X64-NEXT: imulq %r12, %rbp
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
-; X64-NEXT: mulq %r12
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: addq %rbp, %rdx
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: adcq %r14, %rdx
-; X64-NEXT: addq %r10, %rdi
-; X64-NEXT: movzbl %bl, %r10d
-; X64-NEXT: adcq %r8, %r10
-; X64-NEXT: addq %rax, %rdi
-; X64-NEXT: adcq %rdx, %r10
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT: adcq %r11, %r14
-; X64-NEXT: adcq %r9, %rdi
-; X64-NEXT: adcq %rsi, %r10
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
-; X64-NEXT: adcq %r15, %rdi
-; X64-NEXT: adcq %r13, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: sarq $63, %rax
-; X64-NEXT: xorq %rax, %r10
-; X64-NEXT: xorq %rax, %r14
-; X64-NEXT: orq %r10, %r14
-; X64-NEXT: xorq %rax, %rdi
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r15, %rdi
+; X64-NEXT: adcq %r9, %r8
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rbx
+; X64-NEXT: addq %r8, %rax
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: adcq %r10, %rdx
+; X64-NEXT: addq %r13, %rsi
+; X64-NEXT: adcq %r11, %rdi
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
+; X64-NEXT: adcq %rbp, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
+; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: sarq $63, %rcx
; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: orq %rdi, %rax
-; X64-NEXT: orq %r14, %rax
+; X64-NEXT: xorq %rcx, %rsi
+; X64-NEXT: orq %rax, %rsi
+; X64-NEXT: xorq %rcx, %rdx
+; X64-NEXT: xorq %rdi, %rcx
+; X64-NEXT: orq %rdx, %rcx
+; X64-NEXT: orq %rsi, %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq %rdx, 24(%rax)
+; X64-NEXT: movq %r8, 24(%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
; X64-NEXT: movq %rcx, (%rax)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
@@ -576,193 +523,273 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: subl $156, %esp
-; X86-NEXT: .cfi_def_cfa_offset 176
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: .cfi_def_cfa_offset 148
; X86-NEXT: .cfi_offset %esi, -20
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl %ebp, %edx
+; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: adcl %edi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ecx, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edi, %ebp
+; X86-NEXT: sarl $31, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %edx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl %ebp, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl %edx, %ebp
; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebp, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
+; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ecx
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
@@ -775,238 +802,261 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl %ebp, %esi
+; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %eax
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
-; X86-NEXT: adcl %edi, %eax
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
; X86-NEXT: addl %esi, %edi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: setb %bl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: adcl %ebx, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %eax, %ebp
+; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edi, %ebx
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: setb %bl
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT: addl %ebp, %ebx
-; X86-NEXT: movl (%esp), %edi ## 4-byte Reload
-; X86-NEXT: adcl %esi, %edi
+; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %ebp, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %esi, %ebp
+; X86-NEXT: sarl $31, %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %eax, %edi
+; X86-NEXT: adcl %edx, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: imull %ebp, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: addl %eax, %ecx
+; X86-NEXT: adcl %edx, %ebp
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl %edx, (%esp) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, %eax
; X86-NEXT: adcl $0, %edi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, %edx
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %ebp, %edx
+; X86-NEXT: sarl $31, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
+; X86-NEXT: adcl %ecx, %ebp
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
@@ -1024,6 +1074,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %esi, %eax
@@ -1033,46 +1084,46 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edi, %ebp
; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: adcl %edi, %esi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebp
-; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %eax, %edi
+; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1082,15 +1133,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %esi, %ebp
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1099,349 +1150,121 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: adcl %ebp, %ebx
+; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT: adcl %edi, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
; X86-NEXT: adcl %eax, %esi
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: sarl $31, %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %ebx
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl %ebp, %esi
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ecx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %eax, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl %ebx, %ebp
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl %ebp, %edi
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: setb %al
-; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edx, %ebp
+; X86-NEXT: adcl %ebp, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: imull %edi, %ebp
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebp, %edx
; X86-NEXT: addl %eax, %edx
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: addl %eax, %ebx
; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: adcl %ebx, %eax
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: movzbl (%esp), %ebp ## 1-byte Folded Reload
-; X86-NEXT: adcl %edx, %ebp
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: adcl %ebp, %eax
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: adcl %edx, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT: adcl %edi, (%esp) ## 4-byte Folded Spill
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: addl %ebx, %ecx
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: setb %al
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: addl %eax, %ebp
+; X86-NEXT: adcl %edx, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %ebp
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: adcl %edi, %ebp
-; X86-NEXT: addl %esi, %ebx
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl %ebp, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: imull %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: imull %ebp, %ebx
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebx, %edx
; X86-NEXT: addl %eax, %edx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl %ebp, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: addl %eax, %ecx
-; X86-NEXT: adcl %edx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT: addl %eax, %esi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT: movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT: adcl %ebx, %ecx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: xorl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: orl %esi, %edx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT: movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: xorl %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: xorl %eax, %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: xorl %eax, %edi
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %edi, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %ebp, 28(%eax)
+; X86-NEXT: movl %esi, 28(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1457,7 +1280,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
; X86-NEXT: movl %ecx, 24(%eax)
; X86-NEXT: setne %al
-; X86-NEXT: addl $156, %esp
+; X86-NEXT: addl $128, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 49cb7c707a14f3..a54ff67f747550 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3291,124 +3291,98 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: movq %r8, %r15
-; SSE2-NEXT: movq %rdx, %r8
+; SSE2-NEXT: movq %r9, %r10
+; SSE2-NEXT: movq %rcx, %rbx
+; SSE2-NEXT: movq %rdx, %rcx
; SSE2-NEXT: movq %rsi, %r11
-; SSE2-NEXT: movq %rdi, %r10
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT: movq %rsi, %rdx
-; SSE2-NEXT: sarq $63, %rdx
-; SSE2-NEXT: movq %r9, %rbx
-; SSE2-NEXT: imulq %rdx, %rbx
-; SSE2-NEXT: movq %r15, %rax
-; SSE2-NEXT: mulq %rdx
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; SSE2-NEXT: movq %rsi, %rbp
+; SSE2-NEXT: sarq $63, %rbp
+; SSE2-NEXT: imulq %r8, %rbp
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: mulq %r8
; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: movq %rax, %r12
-; SSE2-NEXT: addq %rax, %rsi
-; SSE2-NEXT: addq %rbx, %rsi
-; SSE2-NEXT: movq %r9, %rax
-; SSE2-NEXT: sarq $63, %rax
-; SSE2-NEXT: movq %rax, %r13
-; SSE2-NEXT: imulq %r11, %r13
-; SSE2-NEXT: mulq %r10
+; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT: movq %r11, %rax
+; SSE2-NEXT: mulq %r8
+; SSE2-NEXT: movq %rdx, %r8
; SSE2-NEXT: movq %rax, %r14
-; SSE2-NEXT: movq %rdx, %rbx
-; SSE2-NEXT: addq %r13, %rbx
-; SSE2-NEXT: addq %rax, %rbx
-; SSE2-NEXT: addq %r12, %r14
-; SSE2-NEXT: adcq %rsi, %rbx
-; SSE2-NEXT: movq %r10, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: movq %rdx, %r12
+; SSE2-NEXT: addq %rsi, %r14
+; SSE2-NEXT: adcq %rbp, %r8
+; SSE2-NEXT: movq %r8, %rbp
+; SSE2-NEXT: sarq $63, %rbp
+; SSE2-NEXT: sarq $63, %r9
+; SSE2-NEXT: imulq %rdi, %r9
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: mulq %r10
+; SSE2-NEXT: movq %rdx, %rdi
; SSE2-NEXT: movq %rax, %rsi
+; SSE2-NEXT: addq %r14, %rsi
+; SSE2-NEXT: adcq %r9, %rdi
+; SSE2-NEXT: movq %rdi, %r9
+; SSE2-NEXT: sarq $63, %r9
+; SSE2-NEXT: addq %r8, %rdi
+; SSE2-NEXT: adcq %rbp, %r9
; SSE2-NEXT: movq %r11, %rax
-; SSE2-NEXT: mulq %r15
-; SSE2-NEXT: movq %rdx, %r15
-; SSE2-NEXT: movq %rax, %r13
-; SSE2-NEXT: addq %r12, %r13
-; SSE2-NEXT: adcq $0, %r15
-; SSE2-NEXT: movq %r10, %rax
-; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: movq %rdx, %r12
-; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: addq %r13, %r10
-; SSE2-NEXT: adcq %r15, %r12
-; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %r15d
-; SSE2-NEXT: movq %r11, %rax
-; SSE2-NEXT: mulq %r9
-; SSE2-NEXT: addq %r12, %rax
-; SSE2-NEXT: adcq %r15, %rdx
-; SSE2-NEXT: addq %r14, %rax
-; SSE2-NEXT: adcq %rbx, %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT: movq %r10, 8(%r12)
+; SSE2-NEXT: imulq %r10
+; SSE2-NEXT: addq %rdi, %rax
+; SSE2-NEXT: adcq %r9, %rdx
+; SSE2-NEXT: movq %rsi, 8(%r15)
+; SSE2-NEXT: sarq $63, %rsi
+; SSE2-NEXT: xorq %rsi, %rdx
+; SSE2-NEXT: xorq %rax, %rsi
+; SSE2-NEXT: xorl %r11d, %r11d
+; SSE2-NEXT: orq %rdx, %rsi
+; SSE2-NEXT: setne %r11b
+; SSE2-NEXT: movq %rbx, %r10
; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: xorq %r10, %rdx
-; SSE2-NEXT: xorq %rax, %r10
-; SSE2-NEXT: xorl %r15d, %r15d
-; SSE2-NEXT: orq %rdx, %r10
-; SSE2-NEXT: setne %r15b
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: sarq $63, %rdx
-; SSE2-NEXT: movq %rbp, %r10
-; SSE2-NEXT: imulq %rdx, %r10
-; SSE2-NEXT: movq %rdi, %rax
-; SSE2-NEXT: mulq %rdx
-; SSE2-NEXT: movq %rdx, %r9
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: addq %rax, %r9
-; SSE2-NEXT: addq %r10, %r9
-; SSE2-NEXT: movq %rbp, %rax
-; SSE2-NEXT: sarq $63, %rax
-; SSE2-NEXT: movq %rax, %r14
-; SSE2-NEXT: imulq %rcx, %r14
-; SSE2-NEXT: mulq %r8
-; SSE2-NEXT: movq %rax, %r11
-; SSE2-NEXT: movq %rdx, %r10
-; SSE2-NEXT: addq %r14, %r10
-; SSE2-NEXT: addq %rax, %r10
-; SSE2-NEXT: addq %rbx, %r11
-; SSE2-NEXT: adcq %r9, %r10
-; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: mulq %rdi
-; SSE2-NEXT: movq %rdx, %rbx
-; SSE2-NEXT: movq %rax, %r9
+; SSE2-NEXT: imulq %r13, %r10
; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: mulq %rdi
+; SSE2-NEXT: mulq %r13
; SSE2-NEXT: movq %rdx, %rdi
-; SSE2-NEXT: movq %rax, %r14
-; SSE2-NEXT: addq %rbx, %r14
-; SSE2-NEXT: adcq $0, %rdi
-; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: mulq %rbp
+; SSE2-NEXT: movq %rax, %rsi
+; SSE2-NEXT: movq %rbx, %rax
+; SSE2-NEXT: mulq %r13
; SSE2-NEXT: movq %rdx, %r8
-; SSE2-NEXT: movq %rax, %rbx
-; SSE2-NEXT: addq %r14, %rbx
-; SSE2-NEXT: adcq %rdi, %r8
-; SSE2-NEXT: setb %al
-; SSE2-NEXT: movzbl %al, %edi
+; SSE2-NEXT: movq %rax, %r9
+; SSE2-NEXT: addq %rdi, %r9
+; SSE2-NEXT: adcq %r10, %r8
+; SSE2-NEXT: movq %r8, %r14
+; SSE2-NEXT: sarq $63, %r14
+; SSE2-NEXT: movq %r12, %r13
+; SSE2-NEXT: sarq $63, %r13
+; SSE2-NEXT: imulq %rcx, %r13
; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: mulq %rbp
-; SSE2-NEXT: addq %r8, %rax
-; SSE2-NEXT: adcq %rdi, %rdx
-; SSE2-NEXT: addq %r11, %rax
-; SSE2-NEXT: adcq %r10, %rdx
-; SSE2-NEXT: movq %rbx, 24(%r12)
-; SSE2-NEXT: sarq $63, %rbx
-; SSE2-NEXT: xorq %rbx, %rdx
-; SSE2-NEXT: xorq %rax, %rbx
+; SSE2-NEXT: mulq %r12
+; SSE2-NEXT: movq %rdx, %rdi
+; SSE2-NEXT: movq %rax, %r10
+; SSE2-NEXT: addq %r9, %r10
+; SSE2-NEXT: adcq %r13, %rdi
+; SSE2-NEXT: movq %rdi, %rcx
+; SSE2-NEXT: sarq $63, %rcx
+; SSE2-NEXT: addq %r8, %rdi
+; SSE2-NEXT: adcq %r14, %rcx
+; SSE2-NEXT: movq %rbx, %rax
+; SSE2-NEXT: imulq %r12
+; SSE2-NEXT: addq %rdi, %rax
+; SSE2-NEXT: adcq %rcx, %rdx
+; SSE2-NEXT: movq %r10, 24(%r15)
+; SSE2-NEXT: sarq $63, %r10
+; SSE2-NEXT: xorq %r10, %rdx
+; SSE2-NEXT: xorq %rax, %r10
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rdx, %rbx
+; SSE2-NEXT: orq %rdx, %r10
; SSE2-NEXT: setne %al
; SSE2-NEXT: negl %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: negl %r15d
-; SSE2-NEXT: movd %r15d, %xmm0
+; SSE2-NEXT: negl %r11d
+; SSE2-NEXT: movd %r11d, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq %r9, 16(%r12)
-; SSE2-NEXT: movq %rsi, (%r12)
+; SSE2-NEXT: movq %rsi, 16(%r15)
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: movq %rax, (%r15)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -3425,124 +3399,98 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSSE3-NEXT: pushq %r13
; SSSE3-NEXT: pushq %r12
; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movq %r8, %r15
-; SSSE3-NEXT: movq %rdx, %r8
+; SSSE3-NEXT: movq %r9, %r10
+; SSSE3-NEXT: movq %rcx, %rbx
+; SSSE3-NEXT: movq %rdx, %rcx
; SSSE3-NEXT: movq %rsi, %r11
-; SSSE3-NEXT: movq %rdi, %r10
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSSE3-NEXT: movq %rsi, %rdx
-; SSSE3-NEXT: sarq $63, %rdx
-; SSSE3-NEXT: movq %r9, %rbx
-; SSSE3-NEXT: imulq %rdx, %rbx
-; SSSE3-NEXT: movq %r15, %rax
-; SSSE3-NEXT: mulq %rdx
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; SSSE3-NEXT: movq %rsi, %rbp
+; SSSE3-NEXT: sarq $63, %rbp
+; SSSE3-NEXT: imulq %r8, %rbp
+; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: mulq %r8
; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: movq %rax, %r12
-; SSSE3-NEXT: addq %rax, %rsi
-; SSSE3-NEXT: addq %rbx, %rsi
-; SSSE3-NEXT: movq %r9, %rax
-; SSSE3-NEXT: sarq $63, %rax
-; SSSE3-NEXT: movq %rax, %r13
-; SSSE3-NEXT: imulq %r11, %r13
-; SSSE3-NEXT: mulq %r10
+; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSSE3-NEXT: movq %r11, %rax
+; SSSE3-NEXT: mulq %r8
+; SSSE3-NEXT: movq %rdx, %r8
; SSSE3-NEXT: movq %rax, %r14
-; SSSE3-NEXT: movq %rdx, %rbx
-; SSSE3-NEXT: addq %r13, %rbx
-; SSSE3-NEXT: addq %rax, %rbx
-; SSSE3-NEXT: addq %r12, %r14
-; SSSE3-NEXT: adcq %rsi, %rbx
-; SSSE3-NEXT: movq %r10, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: movq %rdx, %r12
+; SSSE3-NEXT: addq %rsi, %r14
+; SSSE3-NEXT: adcq %rbp, %r8
+; SSSE3-NEXT: movq %r8, %rbp
+; SSSE3-NEXT: sarq $63, %rbp
+; SSSE3-NEXT: sarq $63, %r9
+; SSSE3-NEXT: imulq %rdi, %r9
+; SSSE3-NEXT: movq %rdi, %rax
+; SSSE3-NEXT: mulq %r10
+; SSSE3-NEXT: movq %rdx, %rdi
; SSSE3-NEXT: movq %rax, %rsi
+; SSSE3-NEXT: addq %r14, %rsi
+; SSSE3-NEXT: adcq %r9, %rdi
+; SSSE3-NEXT: movq %rdi, %r9
+; SSSE3-NEXT: sarq $63, %r9
+; SSSE3-NEXT: addq %r8, %rdi
+; SSSE3-NEXT: adcq %rbp, %r9
; SSSE3-NEXT: movq %r11, %rax
-; SSSE3-NEXT: mulq %r15
-; SSSE3-NEXT: movq %rdx, %r15
-; SSSE3-NEXT: movq %rax, %r13
-; SSSE3-NEXT: addq %r12, %r13
-; SSSE3-NEXT: adcq $0, %r15
-; SSSE3-NEXT: movq %r10, %rax
-; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: movq %rdx, %r12
-; SSSE3-NEXT: movq %rax, %r10
-; SSSE3-NEXT: addq %r13, %r10
-; SSSE3-NEXT: adcq %r15, %r12
-; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %r15d
-; SSSE3-NEXT: movq %r11, %rax
-; SSSE3-NEXT: mulq %r9
-; SSSE3-NEXT: addq %r12, %rax
-; SSSE3-NEXT: adcq %r15, %rdx
-; SSSE3-NEXT: addq %r14, %rax
-; SSSE3-NEXT: adcq %rbx, %rdx
-; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSSE3-NEXT: movq %r10, 8(%r12)
+; SSSE3-NEXT: imulq %r10
+; SSSE3-NEXT: addq %rdi, %rax
+; SSSE3-NEXT: adcq %r9, %rdx
+; SSSE3-NEXT: movq %rsi, 8(%r15)
+; SSSE3-NEXT: sarq $63, %rsi
+; SSSE3-NEXT: xorq %rsi, %rdx
+; SSSE3-NEXT: xorq %rax, %rsi
+; SSSE3-NEXT: xorl %r11d, %r11d
+; SSSE3-NEXT: orq %rdx, %rsi
+; SSSE3-NEXT: setne %r11b
+; SSSE3-NEXT: movq %rbx, %r10
; SSSE3-NEXT: sarq $63, %r10
-; SSSE3-NEXT: xorq %r10, %rdx
-; SSSE3-NEXT: xorq %rax, %r10
-; SSSE3-NEXT: xorl %r15d, %r15d
-; SSSE3-NEXT: orq %rdx, %r10
-; SSSE3-NEXT: setne %r15b
-; SSSE3-NEXT: movq %rcx, %rdx
-; SSSE3-NEXT: sarq $63, %rdx
-; SSSE3-NEXT: movq %rbp, %r10
-; SSSE3-NEXT: imulq %rdx, %r10
-; SSSE3-NEXT: movq %rdi, %rax
-; SSSE3-NEXT: mulq %rdx
-; SSSE3-NEXT: movq %rdx, %r9
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: addq %rax, %r9
-; SSSE3-NEXT: addq %r10, %r9
-; SSSE3-NEXT: movq %rbp, %rax
-; SSSE3-NEXT: sarq $63, %rax
-; SSSE3-NEXT: movq %rax, %r14
-; SSSE3-NEXT: imulq %rcx, %r14
-; SSSE3-NEXT: mulq %r8
-; SSSE3-NEXT: movq %rax, %r11
-; SSSE3-NEXT: movq %rdx, %r10
-; SSSE3-NEXT: addq %r14, %r10
-; SSSE3-NEXT: addq %rax, %r10
-; SSSE3-NEXT: addq %rbx, %r11
-; SSSE3-NEXT: adcq %r9, %r10
-; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: mulq %rdi
-; SSSE3-NEXT: movq %rdx, %rbx
-; SSSE3-NEXT: movq %rax, %r9
+; SSSE3-NEXT: imulq %r13, %r10
; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: mulq %rdi
+; SSSE3-NEXT: mulq %r13
; SSSE3-NEXT: movq %rdx, %rdi
-; SSSE3-NEXT: movq %rax, %r14
-; SSSE3-NEXT: addq %rbx, %r14
-; SSSE3-NEXT: adcq $0, %rdi
-; SSSE3-NEXT: movq %r8, %rax
-; SSSE3-NEXT: mulq %rbp
+; SSSE3-NEXT: movq %rax, %rsi
+; SSSE3-NEXT: movq %rbx, %rax
+; SSSE3-NEXT: mulq %r13
; SSSE3-NEXT: movq %rdx, %r8
-; SSSE3-NEXT: movq %rax, %rbx
-; SSSE3-NEXT: addq %r14, %rbx
-; SSSE3-NEXT: adcq %rdi, %r8
-; SSSE3-NEXT: setb %al
-; SSSE3-NEXT: movzbl %al, %edi
+; SSSE3-NEXT: movq %rax, %r9
+; SSSE3-NEXT: addq %rdi, %r9
+; SSSE3-NEXT: adcq %r10, %r8
+; SSSE3-NEXT: movq %r8, %r14
+; SSSE3-NEXT: sarq $63, %r14
+; SSSE3-NEXT: movq %r12, %r13
+; SSSE3-NEXT: sarq $63, %r13
+; SSSE3-NEXT: imulq %rcx, %r13
; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: mulq %rbp
-; SSSE3-NEXT: addq %r8, %rax
-; SSSE3-NEXT: adcq %rdi, %rdx
-; SSSE3-NEXT: addq %r11, %rax
-; SSSE3-NEXT: adcq %r10, %rdx
-; SSSE3-NEXT: movq %rbx, 24(%r12)
-; SSSE3-NEXT: sarq $63, %rbx
-; SSSE3-NEXT: xorq %rbx, %rdx
-; SSSE3-NEXT: xorq %rax, %rbx
+; SSSE3-NEXT: mulq %r12
+; SSSE3-NEXT: movq %rdx, %rdi
+; SSSE3-NEXT: movq %rax, %r10
+; SSSE3-NEXT: addq %r9, %r10
+; SSSE3-NEXT: adcq %r13, %rdi
+; SSSE3-NEXT: movq %rdi, %rcx
+; SSSE3-NEXT: sarq $63, %rcx
+; SSSE3-NEXT: addq %r8, %rdi
+; SSSE3-NEXT: adcq %r14, %rcx
+; SSSE3-NEXT: movq %rbx, %rax
+; SSSE3-NEXT: imulq %r12
+; SSSE3-NEXT: addq %rdi, %rax
+; SSSE3-NEXT: adcq %rcx, %rdx
+; SSSE3-NEXT: movq %r10, 24(%r15)
+; SSSE3-NEXT: sarq $63, %r10
+; SSSE3-NEXT: xorq %r10, %rdx
+; SSSE3-NEXT: xorq %rax, %r10
; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: orq %rdx, %rbx
+; SSSE3-NEXT: orq %rdx, %r10
; SSSE3-NEXT: setne %al
; SSSE3-NEXT: negl %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: negl %r15d
-; SSSE3-NEXT: movd %r15d, %xmm0
+; SSSE3-NEXT: negl %r11d
+; SSSE3-NEXT: movd %r11d, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %r9, 16(%r12)
-; SSSE3-NEXT: movq %rsi, (%r12)
+; SSSE3-NEXT: movq %rsi, 16(%r15)
+; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSSE3-NEXT: movq %rax, (%r15)
; SSSE3-NEXT: popq %rbx
; SSSE3-NEXT: popq %r12
; SSSE3-NEXT: popq %r13
@@ -3559,123 +3507,97 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; SSE41-NEXT: pushq %r13
; SSE41-NEXT: pushq %r12
; SSE41-NEXT: pushq %rbx
-; SSE41-NEXT: movq %r8, %r15
-; SSE41-NEXT: movq %rdx, %r8
+; SSE41-NEXT: movq %r9, %r10
+; SSE41-NEXT: movq %rcx, %rbx
+; SSE41-NEXT: movq %rdx, %rcx
; SSE41-NEXT: movq %rsi, %r11
-; SSE41-NEXT: movq %rdi, %r10
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT: movq %rsi, %rdx
-; SSE41-NEXT: sarq $63, %rdx
-; SSE41-NEXT: movq %r9, %rbx
-; SSE41-NEXT: imulq %rdx, %rbx
-; SSE41-NEXT: movq %r15, %rax
-; SSE41-NEXT: mulq %rdx
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; SSE41-NEXT: movq %rsi, %rbp
+; SSE41-NEXT: sarq $63, %rbp
+; SSE41-NEXT: imulq %r8, %rbp
+; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: mulq %r8
; SSE41-NEXT: movq %rdx, %rsi
-; SSE41-NEXT: movq %rax, %r12
-; SSE41-NEXT: addq %rax, %rsi
-; SSE41-NEXT: addq %rbx, %rsi
-; SSE41-NEXT: movq %r9, %rax
-; SSE41-NEXT: sarq $63, %rax
-; SSE41-NEXT: movq %rax, %r13
-; SSE41-NEXT: imulq %r11, %r13
-; SSE41-NEXT: mulq %r10
+; SSE41-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE41-NEXT: movq %r11, %rax
+; SSE41-NEXT: mulq %r8
+; SSE41-NEXT: movq %rdx, %r8
; SSE41-NEXT: movq %rax, %r14
-; SSE41-NEXT: movq %rdx, %rbx
-; SSE41-NEXT: addq %r13, %rbx
-; SSE41-NEXT: addq %rax, %rbx
-; SSE41-NEXT: addq %r12, %r14
-; SSE41-NEXT: adcq %rsi, %rbx
-; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: movq %rdx, %r12
+; SSE41-NEXT: addq %rsi, %r14
+; SSE41-NEXT: adcq %rbp, %r8
+; SSE41-NEXT: movq %r8, %rbp
+; SSE41-NEXT: sarq $63, %rbp
+; SSE41-NEXT: sarq $63, %r9
+; SSE41-NEXT: imulq %rdi, %r9
+; SSE41-NEXT: movq %rdi, %rax
+; SSE41-NEXT: mulq %r10
+; SSE41-NEXT: movq %rdx, %rdi
; SSE41-NEXT: movq %rax, %rsi
+; SSE41-NEXT: addq %r14, %rsi
+; SSE41-NEXT: adcq %r9, %rdi
+; SSE41-NEXT: movq %rdi, %r9
+; SSE41-NEXT: sarq $63, %r9
+; SSE41-NEXT: addq %r8, %rdi
+; SSE41-NEXT: adcq %rbp, %r9
; SSE41-NEXT: movq %r11, %rax
-; SSE41-NEXT: mulq %r15
-; SSE41-NEXT: movq %rdx, %r15
-; SSE41-NEXT: movq %rax, %r13
-; SSE41-NEXT: addq %r12, %r13
-; SSE41-NEXT: adcq $0, %r15
-; SSE41-NEXT: movq %r10, %rax
-; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: movq %rdx, %r12
-; SSE41-NEXT: movq %rax, %r10
-; SSE41-NEXT: addq %r13, %r10
-; SSE41-NEXT: adcq %r15, %r12
-; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %r15d
-; SSE41-NEXT: movq %r11, %rax
-; SSE41-NEXT: mulq %r9
-; SSE41-NEXT: addq %r12, %rax
-; SSE41-NEXT: adcq %r15, %rdx
-; SSE41-NEXT: addq %r14, %rax
-; SSE41-NEXT: adcq %rbx, %rdx
-; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE41-NEXT: movq %r10, 8(%r12)
+; SSE41-NEXT: imulq %r10
+; SSE41-NEXT: addq %rdi, %rax
+; SSE41-NEXT: adcq %r9, %rdx
+; SSE41-NEXT: movq %rsi, 8(%r15)
+; SSE41-NEXT: sarq $63, %rsi
+; SSE41-NEXT: xorq %rsi, %rdx
+; SSE41-NEXT: xorq %rax, %rsi
+; SSE41-NEXT: xorl %r11d, %r11d
+; SSE41-NEXT: orq %rdx, %rsi
+; SSE41-NEXT: setne %r11b
+; SSE41-NEXT: movq %rbx, %r10
; SSE41-NEXT: sarq $63, %r10
-; SSE41-NEXT: xorq %r10, %rdx
-; SSE41-NEXT: xorq %rax, %r10
-; SSE41-NEXT: xorl %r15d, %r15d
-; SSE41-NEXT: orq %rdx, %r10
-; SSE41-NEXT: setne %r15b
-; SSE41-NEXT: movq %rcx, %rdx
-; SSE41-NEXT: sarq $63, %rdx
-; SSE41-NEXT: movq %rbp, %r10
-; SSE41-NEXT: imulq %rdx, %r10
-; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: mulq %rdx
-; SSE41-NEXT: movq %rdx, %r9
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: addq %rax, %r9
-; SSE41-NEXT: addq %r10, %r9
-; SSE41-NEXT: movq %rbp, %rax
-; SSE41-NEXT: sarq $63, %rax
-; SSE41-NEXT: movq %rax, %r14
-; SSE41-NEXT: imulq %rcx, %r14
-; SSE41-NEXT: mulq %r8
-; SSE41-NEXT: movq %rax, %r11
-; SSE41-NEXT: movq %rdx, %r10
-; SSE41-NEXT: addq %r14, %r10
-; SSE41-NEXT: addq %rax, %r10
-; SSE41-NEXT: addq %rbx, %r11
-; SSE41-NEXT: adcq %r9, %r10
-; SSE41-NEXT: movq %r8, %rax
-; SSE41-NEXT: mulq %rdi
-; SSE41-NEXT: movq %rdx, %rbx
-; SSE41-NEXT: movq %rax, %r9
+; SSE41-NEXT: imulq %r13, %r10
; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %rdi
+; SSE41-NEXT: mulq %r13
; SSE41-NEXT: movq %rdx, %rdi
-; SSE41-NEXT: movq %rax, %r14
-; SSE41-NEXT: addq %rbx, %r14
-; SSE41-NEXT: adcq $0, %rdi
-; SSE41-NEXT: movq %r8, %rax
-; SSE41-NEXT: mulq %rbp
+; SSE41-NEXT: movq %rax, %rsi
+; SSE41-NEXT: movq %rbx, %rax
+; SSE41-NEXT: mulq %r13
; SSE41-NEXT: movq %rdx, %r8
-; SSE41-NEXT: movq %rax, %rbx
-; SSE41-NEXT: addq %r14, %rbx
-; SSE41-NEXT: adcq %rdi, %r8
-; SSE41-NEXT: setb %al
-; SSE41-NEXT: movzbl %al, %edi
+; SSE41-NEXT: movq %rax, %r9
+; SSE41-NEXT: addq %rdi, %r9
+; SSE41-NEXT: adcq %r10, %r8
+; SSE41-NEXT: movq %r8, %r14
+; SSE41-NEXT: sarq $63, %r14
+; SSE41-NEXT: movq %r12, %r13
+; SSE41-NEXT: sarq $63, %r13
+; SSE41-NEXT: imulq %rcx, %r13
; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %rbp
-; SSE41-NEXT: addq %r8, %rax
-; SSE41-NEXT: adcq %rdi, %rdx
-; SSE41-NEXT: addq %r11, %rax
-; SSE41-NEXT: adcq %r10, %rdx
-; SSE41-NEXT: movq %rbx, 24(%r12)
-; SSE41-NEXT: sarq $63, %rbx
-; SSE41-NEXT: xorq %rbx, %rdx
-; SSE41-NEXT: xorq %rax, %rbx
+; SSE41-NEXT: mulq %r12
+; SSE41-NEXT: movq %rdx, %rdi
+; SSE41-NEXT: movq %rax, %r10
+; SSE41-NEXT: addq %r9, %r10
+; SSE41-NEXT: adcq %r13, %rdi
+; SSE41-NEXT: movq %rdi, %rcx
+; SSE41-NEXT: sarq $63, %rcx
+; SSE41-NEXT: addq %r8, %rdi
+; SSE41-NEXT: adcq %r14, %rcx
+; SSE41-NEXT: movq %rbx, %rax
+; SSE41-NEXT: imulq %r12
+; SSE41-NEXT: addq %rdi, %rax
+; SSE41-NEXT: adcq %rcx, %rdx
+; SSE41-NEXT: movq %r10, 24(%r15)
+; SSE41-NEXT: sarq $63, %r10
+; SSE41-NEXT: xorq %r10, %rdx
+; SSE41-NEXT: xorq %rax, %r10
; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: orq %rdx, %rbx
+; SSE41-NEXT: orq %rdx, %r10
; SSE41-NEXT: setne %al
; SSE41-NEXT: negl %eax
-; SSE41-NEXT: negl %r15d
-; SSE41-NEXT: movd %r15d, %xmm0
+; SSE41-NEXT: negl %r11d
+; SSE41-NEXT: movd %r11d, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: movq %r9, 16(%r12)
-; SSE41-NEXT: movq %rsi, (%r12)
+; SSE41-NEXT: movq %rsi, 16(%r15)
+; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE41-NEXT: movq %rax, (%r15)
; SSE41-NEXT: popq %rbx
; SSE41-NEXT: popq %r12
; SSE41-NEXT: popq %r13
@@ -3692,123 +3614,97 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX-NEXT: pushq %r13
; AVX-NEXT: pushq %r12
; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %r8, %r15
-; AVX-NEXT: movq %rdx, %r8
+; AVX-NEXT: movq %r9, %r10
+; AVX-NEXT: movq %rcx, %rbx
+; AVX-NEXT: movq %rdx, %rcx
; AVX-NEXT: movq %rsi, %r11
-; AVX-NEXT: movq %rdi, %r10
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT: movq %rsi, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: movq %r9, %rbx
-; AVX-NEXT: imulq %rdx, %rbx
-; AVX-NEXT: movq %r15, %rax
-; AVX-NEXT: mulq %rdx
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX-NEXT: movq %rsi, %rbp
+; AVX-NEXT: sarq $63, %rbp
+; AVX-NEXT: imulq %r8, %rbp
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: mulq %r8
; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: movq %rax, %r12
-; AVX-NEXT: addq %rax, %rsi
-; AVX-NEXT: addq %rbx, %rsi
-; AVX-NEXT: movq %r9, %rax
-; AVX-NEXT: sarq $63, %rax
-; AVX-NEXT: movq %rax, %r13
-; AVX-NEXT: imulq %r11, %r13
-; AVX-NEXT: mulq %r10
+; AVX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX-NEXT: movq %r11, %rax
+; AVX-NEXT: mulq %r8
+; AVX-NEXT: movq %rdx, %r8
; AVX-NEXT: movq %rax, %r14
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: addq %r13, %rbx
-; AVX-NEXT: addq %rax, %rbx
-; AVX-NEXT: addq %r12, %r14
-; AVX-NEXT: adcq %rsi, %rbx
-; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r15
-; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: addq %rsi, %r14
+; AVX-NEXT: adcq %rbp, %r8
+; AVX-NEXT: movq %r8, %rbp
+; AVX-NEXT: sarq $63, %rbp
+; AVX-NEXT: sarq $63, %r9
+; AVX-NEXT: imulq %rdi, %r9
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: mulq %r10
+; AVX-NEXT: movq %rdx, %rdi
; AVX-NEXT: movq %rax, %rsi
+; AVX-NEXT: addq %r14, %rsi
+; AVX-NEXT: adcq %r9, %rdi
+; AVX-NEXT: movq %rdi, %r9
+; AVX-NEXT: sarq $63, %r9
+; AVX-NEXT: addq %r8, %rdi
+; AVX-NEXT: adcq %rbp, %r9
; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %r15
-; AVX-NEXT: movq %rdx, %r15
-; AVX-NEXT: movq %rax, %r13
-; AVX-NEXT: addq %r12, %r13
-; AVX-NEXT: adcq $0, %r15
-; AVX-NEXT: movq %r10, %rax
-; AVX-NEXT: mulq %r9
-; AVX-NEXT: movq %rdx, %r12
-; AVX-NEXT: movq %rax, %r10
-; AVX-NEXT: addq %r13, %r10
-; AVX-NEXT: adcq %r15, %r12
-; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %r15d
-; AVX-NEXT: movq %r11, %rax
-; AVX-NEXT: mulq %r9
-; AVX-NEXT: addq %r12, %rax
-; AVX-NEXT: adcq %r15, %rdx
-; AVX-NEXT: addq %r14, %rax
-; AVX-NEXT: adcq %rbx, %rdx
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX-NEXT: movq %r10, 8(%r12)
+; AVX-NEXT: imulq %r10
+; AVX-NEXT: addq %rdi, %rax
+; AVX-NEXT: adcq %r9, %rdx
+; AVX-NEXT: movq %rsi, 8(%r15)
+; AVX-NEXT: sarq $63, %rsi
+; AVX-NEXT: xorq %rsi, %rdx
+; AVX-NEXT: xorq %rax, %rsi
+; AVX-NEXT: xorl %r11d, %r11d
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: setne %r11b
+; AVX-NEXT: movq %rbx, %r10
; AVX-NEXT: sarq $63, %r10
-; AVX-NEXT: xorq %r10, %rdx
-; AVX-NEXT: xorq %rax, %r10
-; AVX-NEXT: xorl %r15d, %r15d
-; AVX-NEXT: orq %rdx, %r10
-; AVX-NEXT: setne %r15b
-; AVX-NEXT: movq %rcx, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: movq %rbp, %r10
-; AVX-NEXT: imulq %rdx, %r10
-; AVX-NEXT: movq %rdi, %rax
-; AVX-NEXT: mulq %rdx
-; AVX-NEXT: movq %rdx, %r9
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: addq %rax, %r9
-; AVX-NEXT: addq %r10, %r9
-; AVX-NEXT: movq %rbp, %rax
-; AVX-NEXT: sarq $63, %rax
-; AVX-NEXT: movq %rax, %r14
-; AVX-NEXT: imulq %rcx, %r14
-; AVX-NEXT: mulq %r8
-; AVX-NEXT: movq %rax, %r11
-; AVX-NEXT: movq %rdx, %r10
-; AVX-NEXT: addq %r14, %r10
-; AVX-NEXT: addq %rax, %r10
-; AVX-NEXT: addq %rbx, %r11
-; AVX-NEXT: adcq %r9, %r10
-; AVX-NEXT: movq %r8, %rax
-; AVX-NEXT: mulq %rdi
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: imulq %r13, %r10
; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rdi
+; AVX-NEXT: mulq %r13
; AVX-NEXT: movq %rdx, %rdi
-; AVX-NEXT: movq %rax, %r14
-; AVX-NEXT: addq %rbx, %r14
-; AVX-NEXT: adcq $0, %rdi
-; AVX-NEXT: movq %r8, %rax
-; AVX-NEXT: mulq %rbp
+; AVX-NEXT: movq %rax, %rsi
+; AVX-NEXT: movq %rbx, %rax
+; AVX-NEXT: mulq %r13
; AVX-NEXT: movq %rdx, %r8
-; AVX-NEXT: movq %rax, %rbx
-; AVX-NEXT: addq %r14, %rbx
-; AVX-NEXT: adcq %rdi, %r8
-; AVX-NEXT: setb %al
-; AVX-NEXT: movzbl %al, %edi
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: addq %rdi, %r9
+; AVX-NEXT: adcq %r10, %r8
+; AVX-NEXT: movq %r8, %r14
+; AVX-NEXT: sarq $63, %r14
+; AVX-NEXT: movq %r12, %r13
+; AVX-NEXT: sarq $63, %r13
+; AVX-NEXT: imulq %rcx, %r13
; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rbp
-; AVX-NEXT: addq %r8, %rax
-; AVX-NEXT: adcq %rdi, %rdx
-; AVX-NEXT: addq %r11, %rax
-; AVX-NEXT: adcq %r10, %rdx
-; AVX-NEXT: movq %rbx, 24(%r12)
-; AVX-NEXT: sarq $63, %rbx
-; AVX-NEXT: xorq %rbx, %rdx
-; AVX-NEXT: xorq %rax, %rbx
+; AVX-NEXT: mulq %r12
+; AVX-NEXT: movq %rdx, %rdi
+; AVX-NEXT: movq %rax, %r10
+; AVX-NEXT: addq %r9, %r10
+; AVX-NEXT: adcq %r13, %rdi
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: sarq $63, %rcx
+; AVX-NEXT: addq %r8, %rdi
+; AVX-NEXT: adcq %r14, %rcx
+; AVX-NEXT: movq %rbx, %rax
+; AVX-NEXT: imulq %r12
+; AVX-NEXT: addq %rdi, %rax
+; AVX-NEXT: adcq %rcx, %rdx
+; AVX-NEXT: movq %r10, 24(%r15)
+; AVX-NEXT: sarq $63, %r10
+; AVX-NEXT: xorq %r10, %rdx
+; AVX-NEXT: xorq %rax, %r10
; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: orq %rdx, %rbx
+; AVX-NEXT: orq %rdx, %r10
; AVX-NEXT: setne %al
; AVX-NEXT: negl %eax
-; AVX-NEXT: negl %r15d
-; AVX-NEXT: vmovd %r15d, %xmm0
+; AVX-NEXT: negl %r11d
+; AVX-NEXT: vmovd %r11d, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: movq %r9, 16(%r12)
-; AVX-NEXT: movq %rsi, (%r12)
+; AVX-NEXT: movq %rsi, 16(%r15)
+; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX-NEXT: movq %rax, (%r15)
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r12
; AVX-NEXT: popq %r13
@@ -3825,57 +3721,43 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %r9, %rbp
; AVX512F-NEXT: movq %rcx, %r11
; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rsi, %r9
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rsi, %rbx
-; AVX512F-NEXT: imulq %rcx, %rbx
-; AVX512F-NEXT: movq %r15, %rax
-; AVX512F-NEXT: mulq %rcx
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq %rax, %r12
-; AVX512F-NEXT: addq %rax, %rcx
-; AVX512F-NEXT: addq %rbx, %rcx
-; AVX512F-NEXT: movq %rsi, %rax
-; AVX512F-NEXT: sarq $63, %rax
-; AVX512F-NEXT: movq %rax, %r13
-; AVX512F-NEXT: imulq %r11, %r13
-; AVX512F-NEXT: mulq %r10
-; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: movq %r11, %rbp
+; AVX512F-NEXT: sarq $63, %rbp
+; AVX512F-NEXT: imulq %r14, %rbp
+; AVX512F-NEXT: movq %rdx, %rax
+; AVX512F-NEXT: mulq %r14
; AVX512F-NEXT: movq %rdx, %rbx
-; AVX512F-NEXT: addq %r13, %rbx
-; AVX512F-NEXT: addq %rax, %rbx
-; AVX512F-NEXT: addq %r12, %r14
-; AVX512F-NEXT: adcq %rcx, %rbx
-; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %r15
-; AVX512F-NEXT: movq %rdx, %r12
-; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %r15
-; AVX512F-NEXT: movq %rdx, %r15
-; AVX512F-NEXT: movq %rax, %r13
-; AVX512F-NEXT: addq %r12, %r13
-; AVX512F-NEXT: adcq $0, %r15
+; AVX512F-NEXT: mulq %r14
+; AVX512F-NEXT: movq %rdx, %r14
+; AVX512F-NEXT: movq %rax, %r15
+; AVX512F-NEXT: addq %rbx, %r15
+; AVX512F-NEXT: adcq %rbp, %r14
+; AVX512F-NEXT: movq %r14, %rbp
+; AVX512F-NEXT: sarq $63, %rbp
+; AVX512F-NEXT: movq %rcx, %r13
+; AVX512F-NEXT: sarq $63, %r13
+; AVX512F-NEXT: imulq %r10, %r13
; AVX512F-NEXT: movq %r10, %rax
-; AVX512F-NEXT: mulq %rsi
-; AVX512F-NEXT: movq %rdx, %r12
+; AVX512F-NEXT: mulq %rcx
+; AVX512F-NEXT: movq %rdx, %rbx
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: addq %r13, %r10
-; AVX512F-NEXT: adcq %r15, %r12
-; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %r15d
+; AVX512F-NEXT: addq %r15, %r10
+; AVX512F-NEXT: adcq %r13, %rbx
+; AVX512F-NEXT: movq %rbx, %r15
+; AVX512F-NEXT: sarq $63, %r15
+; AVX512F-NEXT: addq %r14, %rbx
+; AVX512F-NEXT: adcq %rbp, %r15
; AVX512F-NEXT: movq %r11, %rax
-; AVX512F-NEXT: mulq %rsi
-; AVX512F-NEXT: addq %r12, %rax
+; AVX512F-NEXT: imulq %rcx
+; AVX512F-NEXT: addq %rbx, %rax
; AVX512F-NEXT: adcq %r15, %rdx
-; AVX512F-NEXT: addq %r14, %rax
-; AVX512F-NEXT: adcq %rbx, %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
; AVX512F-NEXT: movq %r10, 24(%r12)
; AVX512F-NEXT: sarq $63, %r10
; AVX512F-NEXT: xorq %r10, %rdx
@@ -3883,56 +3765,43 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: orq %rdx, %r10
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: movq %r9, %rdx
-; AVX512F-NEXT: sarq $63, %rdx
-; AVX512F-NEXT: movq %rbp, %rsi
-; AVX512F-NEXT: imulq %rdx, %rsi
-; AVX512F-NEXT: movq %r8, %rax
-; AVX512F-NEXT: mulq %rdx
-; AVX512F-NEXT: movq %rdx, %r10
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: addq %rax, %r10
-; AVX512F-NEXT: addq %rsi, %r10
-; AVX512F-NEXT: movq %rbp, %rax
-; AVX512F-NEXT: sarq $63, %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: imulq %r9, %rsi
-; AVX512F-NEXT: mulq %rdi
-; AVX512F-NEXT: movq %rax, %rbx
-; AVX512F-NEXT: movq %rdx, %r11
-; AVX512F-NEXT: addq %rsi, %r11
-; AVX512F-NEXT: addq %rax, %r11
-; AVX512F-NEXT: addq %r14, %rbx
-; AVX512F-NEXT: adcq %r10, %r11
+; AVX512F-NEXT: movq %rsi, %rcx
+; AVX512F-NEXT: sarq $63, %rcx
+; AVX512F-NEXT: imulq %r8, %rcx
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: mulq %r8
-; AVX512F-NEXT: movq %rdx, %r14
+; AVX512F-NEXT: movq %rdx, %r11
; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: movq %r9, %rax
+; AVX512F-NEXT: movq %rsi, %rax
; AVX512F-NEXT: mulq %r8
; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rax, %r15
-; AVX512F-NEXT: addq %r14, %r15
-; AVX512F-NEXT: adcq $0, %r8
+; AVX512F-NEXT: movq %rax, %rbx
+; AVX512F-NEXT: addq %r11, %rbx
+; AVX512F-NEXT: adcq %rcx, %r8
+; AVX512F-NEXT: movq %r8, %rcx
+; AVX512F-NEXT: sarq $63, %rcx
+; AVX512F-NEXT: movq %r9, %r14
+; AVX512F-NEXT: sarq $63, %r14
+; AVX512F-NEXT: imulq %rdi, %r14
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: mulq %r9
; AVX512F-NEXT: movq %rdx, %rdi
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: addq %r15, %r14
-; AVX512F-NEXT: adcq %r8, %rdi
-; AVX512F-NEXT: setb %al
-; AVX512F-NEXT: movzbl %al, %esi
-; AVX512F-NEXT: movq %r9, %rax
-; AVX512F-NEXT: mulq %rbp
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: addq %rbx, %r11
+; AVX512F-NEXT: adcq %r14, %rdi
+; AVX512F-NEXT: movq %rdi, %rbx
+; AVX512F-NEXT: sarq $63, %rbx
+; AVX512F-NEXT: addq %r8, %rdi
+; AVX512F-NEXT: adcq %rcx, %rbx
+; AVX512F-NEXT: movq %rsi, %rax
+; AVX512F-NEXT: imulq %r9
; AVX512F-NEXT: addq %rdi, %rax
-; AVX512F-NEXT: adcq %rsi, %rdx
-; AVX512F-NEXT: addq %rbx, %rax
-; AVX512F-NEXT: adcq %r11, %rdx
-; AVX512F-NEXT: movq %r14, 8(%r12)
-; AVX512F-NEXT: sarq $63, %r14
-; AVX512F-NEXT: xorq %r14, %rdx
-; AVX512F-NEXT: xorq %rax, %r14
-; AVX512F-NEXT: orq %rdx, %r14
+; AVX512F-NEXT: adcq %rbx, %rdx
+; AVX512F-NEXT: movq %r11, 8(%r12)
+; AVX512F-NEXT: sarq $63, %r11
+; AVX512F-NEXT: xorq %r11, %rdx
+; AVX512F-NEXT: xorq %rax, %r11
+; AVX512F-NEXT: orq %rdx, %r11
; AVX512F-NEXT: setne %al
; AVX512F-NEXT: andl $1, %eax
; AVX512F-NEXT: kmovw %eax, %k1
@@ -3940,7 +3809,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512F-NEXT: movq %rcx, 16(%r12)
+; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512F-NEXT: movq %rax, 16(%r12)
; AVX512F-NEXT: movq %r10, (%r12)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
@@ -3958,57 +3828,43 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: pushq %r13
; AVX512BW-NEXT: pushq %r12
; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: movq %r9, %rbp
; AVX512BW-NEXT: movq %rcx, %r11
; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rsi, %r9
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512BW-NEXT: sarq $63, %rcx
-; AVX512BW-NEXT: movq %rsi, %rbx
-; AVX512BW-NEXT: imulq %rcx, %rbx
-; AVX512BW-NEXT: movq %r15, %rax
-; AVX512BW-NEXT: mulq %rcx
-; AVX512BW-NEXT: movq %rdx, %rcx
-; AVX512BW-NEXT: movq %rax, %r12
-; AVX512BW-NEXT: addq %rax, %rcx
-; AVX512BW-NEXT: addq %rbx, %rcx
-; AVX512BW-NEXT: movq %rsi, %rax
-; AVX512BW-NEXT: sarq $63, %rax
-; AVX512BW-NEXT: movq %rax, %r13
-; AVX512BW-NEXT: imulq %r11, %r13
-; AVX512BW-NEXT: mulq %r10
-; AVX512BW-NEXT: movq %rax, %r14
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512BW-NEXT: movq %r11, %rbp
+; AVX512BW-NEXT: sarq $63, %rbp
+; AVX512BW-NEXT: imulq %r14, %rbp
+; AVX512BW-NEXT: movq %rdx, %rax
+; AVX512BW-NEXT: mulq %r14
; AVX512BW-NEXT: movq %rdx, %rbx
-; AVX512BW-NEXT: addq %r13, %rbx
-; AVX512BW-NEXT: addq %rax, %rbx
-; AVX512BW-NEXT: addq %r12, %r14
-; AVX512BW-NEXT: adcq %rcx, %rbx
-; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %r15
-; AVX512BW-NEXT: movq %rdx, %r12
-; AVX512BW-NEXT: movq %rax, %rcx
+; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %r15
-; AVX512BW-NEXT: movq %rdx, %r15
-; AVX512BW-NEXT: movq %rax, %r13
-; AVX512BW-NEXT: addq %r12, %r13
-; AVX512BW-NEXT: adcq $0, %r15
+; AVX512BW-NEXT: mulq %r14
+; AVX512BW-NEXT: movq %rdx, %r14
+; AVX512BW-NEXT: movq %rax, %r15
+; AVX512BW-NEXT: addq %rbx, %r15
+; AVX512BW-NEXT: adcq %rbp, %r14
+; AVX512BW-NEXT: movq %r14, %rbp
+; AVX512BW-NEXT: sarq $63, %rbp
+; AVX512BW-NEXT: movq %rcx, %r13
+; AVX512BW-NEXT: sarq $63, %r13
+; AVX512BW-NEXT: imulq %r10, %r13
; AVX512BW-NEXT: movq %r10, %rax
-; AVX512BW-NEXT: mulq %rsi
-; AVX512BW-NEXT: movq %rdx, %r12
+; AVX512BW-NEXT: mulq %rcx
+; AVX512BW-NEXT: movq %rdx, %rbx
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: addq %r13, %r10
-; AVX512BW-NEXT: adcq %r15, %r12
-; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %r15d
+; AVX512BW-NEXT: addq %r15, %r10
+; AVX512BW-NEXT: adcq %r13, %rbx
+; AVX512BW-NEXT: movq %rbx, %r15
+; AVX512BW-NEXT: sarq $63, %r15
+; AVX512BW-NEXT: addq %r14, %rbx
+; AVX512BW-NEXT: adcq %rbp, %r15
; AVX512BW-NEXT: movq %r11, %rax
-; AVX512BW-NEXT: mulq %rsi
-; AVX512BW-NEXT: addq %r12, %rax
+; AVX512BW-NEXT: imulq %rcx
+; AVX512BW-NEXT: addq %rbx, %rax
; AVX512BW-NEXT: adcq %r15, %rdx
-; AVX512BW-NEXT: addq %r14, %rax
-; AVX512BW-NEXT: adcq %rbx, %rdx
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12
; AVX512BW-NEXT: movq %r10, 24(%r12)
; AVX512BW-NEXT: sarq $63, %r10
; AVX512BW-NEXT: xorq %r10, %rdx
@@ -4016,56 +3872,43 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: orq %rdx, %r10
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: movq %r9, %rdx
-; AVX512BW-NEXT: sarq $63, %rdx
-; AVX512BW-NEXT: movq %rbp, %rsi
-; AVX512BW-NEXT: imulq %rdx, %rsi
-; AVX512BW-NEXT: movq %r8, %rax
-; AVX512BW-NEXT: mulq %rdx
-; AVX512BW-NEXT: movq %rdx, %r10
-; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: addq %rax, %r10
-; AVX512BW-NEXT: addq %rsi, %r10
-; AVX512BW-NEXT: movq %rbp, %rax
-; AVX512BW-NEXT: sarq $63, %rax
-; AVX512BW-NEXT: movq %rax, %rsi
-; AVX512BW-NEXT: imulq %r9, %rsi
-; AVX512BW-NEXT: mulq %rdi
-; AVX512BW-NEXT: movq %rax, %rbx
-; AVX512BW-NEXT: movq %rdx, %r11
-; AVX512BW-NEXT: addq %rsi, %r11
-; AVX512BW-NEXT: addq %rax, %r11
-; AVX512BW-NEXT: addq %r14, %rbx
-; AVX512BW-NEXT: adcq %r10, %r11
+; AVX512BW-NEXT: movq %rsi, %rcx
+; AVX512BW-NEXT: sarq $63, %rcx
+; AVX512BW-NEXT: imulq %r8, %rcx
; AVX512BW-NEXT: movq %rdi, %rax
; AVX512BW-NEXT: mulq %r8
-; AVX512BW-NEXT: movq %rdx, %r14
+; AVX512BW-NEXT: movq %rdx, %r11
; AVX512BW-NEXT: movq %rax, %r10
-; AVX512BW-NEXT: movq %r9, %rax
+; AVX512BW-NEXT: movq %rsi, %rax
; AVX512BW-NEXT: mulq %r8
; AVX512BW-NEXT: movq %rdx, %r8
-; AVX512BW-NEXT: movq %rax, %r15
-; AVX512BW-NEXT: addq %r14, %r15
-; AVX512BW-NEXT: adcq $0, %r8
+; AVX512BW-NEXT: movq %rax, %rbx
+; AVX512BW-NEXT: addq %r11, %rbx
+; AVX512BW-NEXT: adcq %rcx, %r8
+; AVX512BW-NEXT: movq %r8, %rcx
+; AVX512BW-NEXT: sarq $63, %rcx
+; AVX512BW-NEXT: movq %r9, %r14
+; AVX512BW-NEXT: sarq $63, %r14
+; AVX512BW-NEXT: imulq %rdi, %r14
; AVX512BW-NEXT: movq %rdi, %rax
-; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: mulq %r9
; AVX512BW-NEXT: movq %rdx, %rdi
-; AVX512BW-NEXT: movq %rax, %r14
-; AVX512BW-NEXT: addq %r15, %r14
-; AVX512BW-NEXT: adcq %r8, %rdi
-; AVX512BW-NEXT: setb %al
-; AVX512BW-NEXT: movzbl %al, %esi
-; AVX512BW-NEXT: movq %r9, %rax
-; AVX512BW-NEXT: mulq %rbp
+; AVX512BW-NEXT: movq %rax, %r11
+; AVX512BW-NEXT: addq %rbx, %r11
+; AVX512BW-NEXT: adcq %r14, %rdi
+; AVX512BW-NEXT: movq %rdi, %rbx
+; AVX512BW-NEXT: sarq $63, %rbx
+; AVX512BW-NEXT: addq %r8, %rdi
+; AVX512BW-NEXT: adcq %rcx, %rbx
+; AVX512BW-NEXT: movq %rsi, %rax
+; AVX512BW-NEXT: imulq %r9
; AVX512BW-NEXT: addq %rdi, %rax
-; AVX512BW-NEXT: adcq %rsi, %rdx
-; AVX512BW-NEXT: addq %rbx, %rax
-; AVX512BW-NEXT: adcq %r11, %rdx
-; AVX512BW-NEXT: movq %r14, 8(%r12)
-; AVX512BW-NEXT: sarq $63, %r14
-; AVX512BW-NEXT: xorq %r14, %rdx
-; AVX512BW-NEXT: xorq %rax, %r14
-; AVX512BW-NEXT: orq %rdx, %r14
+; AVX512BW-NEXT: adcq %rbx, %rdx
+; AVX512BW-NEXT: movq %r11, 8(%r12)
+; AVX512BW-NEXT: sarq $63, %r11
+; AVX512BW-NEXT: xorq %r11, %rdx
+; AVX512BW-NEXT: xorq %rax, %r11
+; AVX512BW-NEXT: orq %rdx, %r11
; AVX512BW-NEXT: setne %al
; AVX512BW-NEXT: andl $1, %eax
; AVX512BW-NEXT: kmovw %eax, %k1
@@ -4073,7 +3916,8 @@ define <2 x i32> @smulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT: movq %rcx, 16(%r12)
+; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512BW-NEXT: movq %rax, 16(%r12)
; AVX512BW-NEXT: movq %r10, (%r12)
; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 2169b39b9dfa05..a076d0d762aa3b 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -210,63 +210,49 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: subl $8, %esp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: movl %edi, %esi
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: imull %ebx, %esi
+; WIN32-NEXT: mull %ebx
; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %esi, %ecx
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: imull %ebx, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %ebp, %eax
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %esi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %ebp, %ecx
-; WIN32-NEXT: adcl $0, %edi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: movl %eax, %ebp
; WIN32-NEXT: addl %ecx, %ebp
-; WIN32-NEXT: adcl %edi, %ebx
-; WIN32-NEXT: setb %cl
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: adcl %esi, %ebx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: movl %ecx, %esi
+; WIN32-NEXT: sarl $31, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl %cl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: imull %eax, %esi
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: adcl %esi, %ecx
+; WIN32-NEXT: movl %ecx, %ebp
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: addl %ebx, %ecx
+; WIN32-NEXT: adcl %edi, %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: imull {{[0-9]+}}(%esp)
+; WIN32-NEXT: addl %ecx, %eax
+; WIN32-NEXT: adcl %ebp, %edx
+; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT: movl %esi, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebp, 4(%eax)
+; WIN32-NEXT: movl %esi, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
@@ -567,66 +553,54 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl %ebx, %esi
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: imull %edi, %esi
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %ecx, %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: imull %ecx, %edi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: adcl %esi, %ebx
; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: addl %edi, %ebx
-; WIN32-NEXT: addl %eax, %ebx
-; WIN32-NEXT: addl %ecx, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %esi, %ebx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %esi, %ecx
-; WIN32-NEXT: adcl $0, %ebp
-; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %ecx, %esi
-; WIN32-NEXT: adcl %ebp, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: setb %cl
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %edi, %eax
-; WIN32-NEXT: movzbl %cl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %ebx, %edx
+; WIN32-NEXT: movl %ecx, %esi
; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: xorl %esi, %edx
-; WIN32-NEXT: xorl %eax, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: orl %edx, %esi
+; WIN32-NEXT: imull %eax, %esi
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: addl %ebp, %edi
+; WIN32-NEXT: adcl %esi, %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: movl %ecx, %ebp
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: addl %ebx, %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: imull %ebx
+; WIN32-NEXT: addl %ecx, %eax
+; WIN32-NEXT: adcl %ebp, %edx
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: xorl %edi, %edx
+; WIN32-NEXT: xorl %eax, %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: orl %edx, %edi
; WIN32-NEXT: jne LBB12_2
; WIN32-NEXT: # %bb.1:
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl %ebx, %esi
; WIN32-NEXT: LBB12_2:
-; WIN32-NEXT: movl %ebp, %edx
+; WIN32-NEXT: movl %esi, %edx
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
@@ -984,59 +958,46 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl %ebp, %ecx
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %edi, %esi
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %esi, %ecx
+; WIN32-NEXT: imull %edi, %ecx
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull %edi
+; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %ebx, %ebp
+; WIN32-NEXT: adcl %ecx, %edi
; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: imull %ebp, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %ebx, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %esi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: sarl $31, %ecx
+; WIN32-NEXT: imull %esi, %ecx
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: mull %edx
; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: addl %ebp, %esi
+; WIN32-NEXT: adcl %ecx, %ebx
+; WIN32-NEXT: movl %ebx, %ebp
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: addl %edi, %ebx
+; WIN32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %ebx, %ecx
-; WIN32-NEXT: adcl $0, %edi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %ecx, %ebx
-; WIN32-NEXT: adcl %edi, %ebp
-; WIN32-NEXT: setb %cl
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %ebp, %eax
-; WIN32-NEXT: movzbl %cl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: sarl $31, %ebx
-; WIN32-NEXT: xorl %ebx, %edx
-; WIN32-NEXT: xorl %eax, %ebx
-; WIN32-NEXT: orl %edx, %ebx
+; WIN32-NEXT: imull {{[0-9]+}}(%esp)
+; WIN32-NEXT: addl %ebx, %eax
+; WIN32-NEXT: adcl %ebp, %edx
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: xorl %esi, %edx
+; WIN32-NEXT: xorl %eax, %esi
+; WIN32-NEXT: orl %edx, %esi
; WIN32-NEXT: jne LBB18_1
; WIN32-NEXT: # %bb.3: # %continue
; WIN32-NEXT: movb $1, %al
@@ -1679,73 +1640,57 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $20, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: subl $12, %esp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %ebx
-; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl (%eax), %ecx
; WIN32-NEXT: movl 4(%eax), %ebp
+; WIN32-NEXT: movl %ebp, %esi
+; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: imull %ebx, %esi
; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: movl %ecx, %edi
-; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: imull %ebp, %ecx
; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: addl %ecx, %ebx
-; WIN32-NEXT: movl %ebp, %ecx
-; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %edi, %esi
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: addl %eax, %edi
-; WIN32-NEXT: addl %esi, %edi
-; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
-; WIN32-NEXT: addl %ecx, %ebx
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ebx, %edi
-; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %esi
+; WIN32-NEXT: mull %ebx
; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: addl %edi, %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: adcl %esi, %ebx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: sarl $31, %edi
; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; WIN32-NEXT: adcl $0, %ebx
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %esi, %ebp
-; WIN32-NEXT: adcl %ebx, %ecx
-; WIN32-NEXT: setb %bl
+; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT: adcl %esi, %ecx
+; WIN32-NEXT: movl %ecx, %ebp
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: addl %ebx, %ecx
+; WIN32-NEXT: adcl %edi, %ebp
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: imull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %ecx, %eax
-; WIN32-NEXT: movzbl %bl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %edi, %edx
-; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: adcl %ebp, %edx
+; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT: movl %esi, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebp, 4(%eax)
+; WIN32-NEXT: movl %esi, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
-; WIN32-NEXT: addl $20, %esp
+; WIN32-NEXT: addl $12, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
@@ -1789,63 +1734,52 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: subl $12, %esp
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %ebp
-; WIN32-NEXT: movl 4(%eax), %eax
-; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: imull %ecx, %esi
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl (%ecx), %ebx
+; WIN32-NEXT: movl %edi, %esi
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: imull %ebx, %esi
+; WIN32-NEXT: mull %ebx
; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: addl %esi, %ecx
-; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: sarl $31, %eax
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %ebx, %eax
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: adcl %ecx, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %ebx, %ecx
-; WIN32-NEXT: adcl $0, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload
+; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: mull %ebx
; WIN32-NEXT: movl %edx, %ebx
; WIN32-NEXT: movl %eax, %ebp
; WIN32-NEXT: addl %ecx, %ebp
-; WIN32-NEXT: adcl %edi, %ebx
-; WIN32-NEXT: setb %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload
-; WIN32-NEXT: addl %ebx, %eax
-; WIN32-NEXT: movzbl %cl, %ecx
-; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %esi, %edx
-; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: movl 4(%eax), %ecx
+; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; WIN32-NEXT: adcl %esi, %ebx
+; WIN32-NEXT: movl %ebx, %edi
+; WIN32-NEXT: sarl $31, %edi
+; WIN32-NEXT: movl %ecx, %esi
+; WIN32-NEXT: sarl $31, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: imull %eax, %esi
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %ecx
+; WIN32-NEXT: addl %ebp, %eax
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: adcl %esi, %ecx
+; WIN32-NEXT: movl %ecx, %ebp
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: addl %ebx, %ecx
+; WIN32-NEXT: adcl %edi, %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: imull (%esp) # 4-byte Folded Reload
+; WIN32-NEXT: addl %ecx, %eax
+; WIN32-NEXT: adcl %ebp, %edx
+; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; WIN32-NEXT: movl %esi, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebp, 4(%eax)
+; WIN32-NEXT: movl %esi, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
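
For readers skimming the updated CHECK lines: the closing `sarl $31` / `xorl` / `orl` / `setne` sequence in each of these WIN32 functions is the overflow test for `llvm.smul.with.overflow.i64`, comparing the high 64 bits of the full product against the sign-extension of the low 64 bits. A minimal C++ sketch of that check follows; the helper name and the use of `__int128` are illustrative assumptions only, not code from this patch, and the actual lowering in the diff builds the two halves out of 32-bit mul/imul/add-with-carry steps instead.

#include <cstdint>
#include <cstdio>

// Hypothetical helper (not from the patch): signed 64x64-bit multiply with
// overflow detection, expressed the way the expanded WIN32 sequences above
// check it. The wide product is split into a low and a high 64-bit half; the
// i64 result overflows exactly when the high half is not the sign-extension
// of the low half's sign bit.
static bool smulo_i64(int64_t A, int64_t B, int64_t &Lo) {
  // __int128 (a GCC/Clang extension) is used only to model the wide product.
  __int128 P = static_cast<__int128>(A) * B;
  int64_t Low = static_cast<int64_t>(P);
  int64_t High = static_cast<int64_t>(P >> 64);
  Lo = Low;
  // Overflow iff the high half differs from the replicated sign of the low
  // half (Low >> 63 is 0 or -1 with an arithmetic shift).
  return High != (Low >> 63);
}

int main() {
  int64_t R;
  std::printf("%d\n", smulo_i64(INT64_MAX, 2, R)); // 1: overflows i64
  std::printf("%d\n", smulo_i64(-4, 5, R));        // 0: fits in i64
}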