[llvm] [CGP]: Optimize mul.overflow. (PR #148343)

Hassnaa Hamdi via llvm-commits llvm-commits at lists.llvm.org
Sat Jul 12 00:03:44 PDT 2025


https://github.com/hassnaaHamdi created https://github.com/llvm/llvm-project/pull/148343

- Detect cases where both LHS and RHS fit in the legal (half-width) type (both Hi parts are zero), so the multiplication cannot overflow.
- Detect cases where only one of LHS or RHS fits in the half-width type (one of the Hi parts is zero), which allows a cheaper multiplication and overflow check; a sketch of this case is included below.
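To illustrate the case split, here is a rough C++ model of the unsigned path for i128, with the value kept as two 64-bit halves. This is only an illustrative sketch; the helper names and the use of the unsigned __int128 extension are mine, not part of the patch.

  #include <cstdint>
  #include <cstdio>
  #include <utility>

  // Hypothetical model of the case split for umul.with.overflow.i128.
  struct U128 {
    uint64_t Lo, Hi;
  };

  // overflow.no: both Hi halves are zero, so a single 64x64->128 multiply
  // can never overflow the 128-bit result.
  std::pair<U128, bool> MulBothFit(uint64_t A, uint64_t B) {
    unsigned __int128 P = (unsigned __int128)A * B;
    return {{(uint64_t)P, (uint64_t)(P >> 64)}, false};
  }

  // overflow.no.rhs.only: RHS fits in 64 bits, LHS does not. Form the partial
  // products P0 = RHS * lo(LHS) and P1 = RHS * hi(LHS); the result overflows
  // iff anything lands above bit 127, i.e. the top half of P1 plus the carry
  // out of the middle addition is non-zero.
  std::pair<U128, bool> MulRHSFits(U128 L, uint64_t R) {
    unsigned __int128 P0 = (unsigned __int128)R * L.Lo;
    unsigned __int128 P1 = (unsigned __int128)R * L.Hi;
    uint64_t Mid = (uint64_t)(P0 >> 64) + (uint64_t)P1; // middle 64 bits
    uint64_t Carry = Mid < (uint64_t)P1;                // carry of that addition
    uint64_t Top = (uint64_t)(P1 >> 64) + Carry;        // bits 128..191
    return {{(uint64_t)P0, Mid}, Top != 0};
  }

  int main() {
    auto [A, AO] = MulBothFit(1ULL << 40, 1ULL << 50); // 2^90: Hi = 2^26, Lo = 0
    U128 L{~0ULL, 1};                                  // L = 2^65 - 1
    auto [B, BO] = MulRHSFits(L, 3);                   // 5 * 2^64 + (2^64 - 3), no overflow
    std::printf("%llu %llu %d  %llu %llu %d\n",
                (unsigned long long)A.Hi, (unsigned long long)A.Lo, (int)AO,
                (unsigned long long)B.Hi, (unsigned long long)B.Lo, (int)BO);
    return 0;
  }

The mirrored overflow.no.lhs.only case swaps the roles of the operands, and the remaining overflow case keeps the original intrinsic.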

From 208fec87b71a76162494a1daf78b54a4ac2d7a35 Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi <hassnaa.hamdi at arm.com>
Date: Sat, 5 Jul 2025 05:57:55 +0000
Subject: [PATCH] [CGP]: Optimize mul.overflow.

- Detect cases where both LHS and RHS fit in the half-width type
(both Hi parts are zero), so the multiplication cannot overflow.
- Detect cases where only one of LHS or RHS fits in the half-width type
(one of the Hi parts is zero), allowing a cheaper overflow check.
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  573 +++
 llvm/test/CodeGen/AArch64/i128-math.ll        |  504 +-
 .../CodeGen/AArch64/i128_with_overflow.ll     |  198 +-
 .../umulo-128-legalisation-lowering.ll        |  205 +-
 .../ARM/umulo-128-legalisation-lowering.ll    |  579 ++-
 .../ARM/umulo-64-legalisation-lowering.ll     |  107 +-
 .../CodeGen/LoongArch/smul-with-overflow.ll   |  985 +++-
 .../umulo-128-legalisation-lowering.ll        |  439 +-
 .../RISCV/umulo-128-legalisation-lowering.ll  |  355 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              | 2893 ++++++++++--
 .../SPARC/smulo-128-legalisation-lowering.ll  | 1255 ++++-
 .../SPARC/umulo-128-legalisation-lowering.ll  |  605 ++-
 .../Thumb/umulo-128-legalisation-lowering.ll  |  654 ++-
 .../Thumb2/umulo-128-legalisation-lowering.ll |  294 +-
 .../Thumb2/umulo-64-legalisation-lowering.ll  |   51 +-
 llvm/test/CodeGen/X86/muloti.ll               |  177 +-
 .../X86/smulo-128-legalisation-lowering.ll    | 4105 +++++++++++++----
 .../X86/umulo-128-legalisation-lowering.ll    |  454 +-
 .../X86/umulo-64-legalisation-lowering.ll     |   85 +-
 llvm/test/CodeGen/X86/xmulo.ll                | 1625 +++++--
 20 files changed, 13143 insertions(+), 3000 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 9bbb89e37865d..d9859ed246604 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -431,6 +431,8 @@ class CodeGenPrepare {
   bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
                           unsigned AddrSpace);
   bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
+  bool optimizeUMulWithOverflow(Instruction *I);
+  bool optimizeSMulWithOverflow(Instruction *I);
   bool optimizeInlineAsmInst(CallInst *CS);
   bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
   bool optimizeExt(Instruction *&I);
@@ -2769,6 +2771,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       return optimizeGatherScatterInst(II, II->getArgOperand(0));
     case Intrinsic::masked_scatter:
       return optimizeGatherScatterInst(II, II->getArgOperand(1));
+    case Intrinsic::umul_with_overflow:
+      return optimizeUMulWithOverflow(II);
+    case Intrinsic::smul_with_overflow:
+      return optimizeSMulWithOverflow(II);
     }
 
     SmallVector<Value *, 2> PtrOps;
@@ -6386,6 +6392,573 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
   return true;
 }
 
+// Rewrite the umul_with_overflow intrinsic by checking whether either (or
+// both) of the operands fits in the legal half-width type. If so, a cheaper
+// multiplication algorithm can be used. Ideally this would be done during
+// type legalization, but reconstructing the IR is not possible there, so we
+// do it here instead.
+bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) {
+  if (TLI->getTypeAction(
+          I->getContext(),
+          TLI->getValueType(*DL, I->getType()->getContainedType(0))) !=
+      TargetLowering::TypeExpandInteger)
+    return false;
+  Value *LHS = I->getOperand(0);
+  Value *RHS = I->getOperand(1);
+  auto *Ty = LHS->getType();
+  unsigned VTBitWidth = Ty->getScalarSizeInBits();
+  unsigned VTHalfBitWidth = VTBitWidth / 2;
+  auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth);
+
+  assert(
+      (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) ==
+       TargetLowering::TypeLegal) &&
+      "Expected the type to be legal for the target lowering");
+
+  I->getParent()->setName("overflow.res");
+  auto *OverflowResBB = I->getParent();
+  auto *OverflowEntryBB =
+      I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true);
+  BasicBlock *OverflowLHSBB = BasicBlock::Create(
+      I->getContext(), "overflow.lhs", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowLHSBB = BasicBlock::Create(
+      I->getContext(), "overflow.no.lhs", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowRHSonlyBB = BasicBlock::Create(
+      I->getContext(), "overflow.no.rhs.only", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowLHSonlyBB = BasicBlock::Create(
+      I->getContext(), "overflow.no.lhs.only", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowBB = BasicBlock::Create(
+      I->getContext(), "overflow.no", I->getFunction(), OverflowResBB);
+  BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow",
+                                              I->getFunction(), OverflowResBB);
+  // The new CFG is:
+  //  overflow.entry:
+  //    hi(LHS) != 0 ? overflow.lhs : overflow.no.lhs

+  //  overflow.lhs:
+  //    hi(RHS) != 0 ? overflow : overflow.no.rhs.only

+  //  overflow.no.lhs:
+  //    hi(RHS) != 0 ? overflow.no.lhs.only : overflow.no

+  //  overflow.no.rhs.only:
+  //  overflow.no.lhs.only:
+  //  overflow.no:
+  //  overflow:
+  //  overflow.res:
+
+  IRBuilder<> BuilderEntryBB(OverflowEntryBB->getTerminator());
+  IRBuilder<> BuilderOverflowLHSBB(OverflowLHSBB);
+  IRBuilder<> BuilderNoOverflowLHSBB(NoOverflowLHSBB);
+  IRBuilder<> BuilderNoOverflowRHSonlyBB(NoOverflowRHSonlyBB);
+  IRBuilder<> BuilderNoOverflowLHSonlyBB(NoOverflowLHSonlyBB);
+  IRBuilder<> BuilderNoOverflowBB(NoOverflowBB);
+  IRBuilder<> BuilderOverflowResBB(OverflowResBB,
+                                   OverflowResBB->getFirstInsertionPt());
+
+  //------------------------------------------------------------------------------
+  // BB overflow.entry:
+  // get Lo and Hi of RHS & LHS:
+
+  auto *LoRHS = BuilderEntryBB.CreateTrunc(RHS, LegalTy, "lo.rhs.trunc");
+  auto *ShrHiRHS = BuilderEntryBB.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr");
+  auto *HiRHS = BuilderEntryBB.CreateTrunc(ShrHiRHS, LegalTy, "hi.rhs.trunc");
+
+  auto *LoLHS = BuilderEntryBB.CreateTrunc(LHS, LegalTy, "lo.lhs.trunc");
+  auto *ShrHiLHS = BuilderEntryBB.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr");
+  auto *HiLHS = BuilderEntryBB.CreateTrunc(ShrHiLHS, LegalTy, "hi.lhs.trunc");
+
+  auto *Cmp = BuilderEntryBB.CreateCmp(ICmpInst::ICMP_NE, HiLHS,
+                                       ConstantInt::getNullValue(LegalTy));
+  BuilderEntryBB.CreateCondBr(Cmp, OverflowLHSBB, NoOverflowLHSBB);
+  OverflowEntryBB->getTerminator()->eraseFromParent();
+
+  //------------------------------------------------------------------------------
+  // BB overflow.lhs:
+  Cmp = BuilderOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS,
+                                       ConstantInt::getNullValue(LegalTy));
+  BuilderOverflowLHSBB.CreateCondBr(Cmp, OverflowBB, NoOverflowRHSonlyBB);
+
+  //------------------------------------------------------------------------------
+  // BB overflow.no.lhs:
+  Cmp = BuilderNoOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS,
+                                         ConstantInt::getNullValue(LegalTy));
+  BuilderNoOverflowLHSBB.CreateCondBr(Cmp, NoOverflowLHSonlyBB, NoOverflowBB);
+
+  //------------------------------------------------------------------------------
+  // BB overflow.no.rhs.only:
+  // RHS fits in the half-width type; LHS uses the full width.
+  // P0 = RHS * LoLHS
+  // P1 = RHS * HiLHS
+
+  LoLHS = BuilderNoOverflowRHSonlyBB.CreateZExt(LoLHS, Ty, "lo.lhs");
+
+  // P0 = (RHS * LoLHS)
+  auto *P0 = BuilderNoOverflowRHSonlyBB.CreateMul(RHS, LoLHS,
+                                                  "mul.no.overflow.rhs.lolhs");
+  auto *P0Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.rhs");
+  auto *P0Hi =
+      BuilderNoOverflowRHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.rhs.lsr");
+  P0Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.rhs");
+
+  // P1 = (RHS * HiLHS)
+  auto *P1 = BuilderNoOverflowRHSonlyBB.CreateMul(RHS, ShrHiLHS,
+                                                  "mul.no.overflow.rhs.hilhs");
+  auto *P1Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.rhs");
+  auto *P1Hi =
+      BuilderNoOverflowRHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.rhs.lsr");
+  P1Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.rhs");
+
+  auto *AddOverflow = BuilderNoOverflowRHSonlyBB.CreateIntrinsic(
+      Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo});
+  auto *AddOResMid = BuilderNoOverflowRHSonlyBB.CreateExtractValue(
+      AddOverflow, 0, "rhs.p0.p1.res");
+  auto *Carry = BuilderNoOverflowRHSonlyBB.CreateExtractValue(
+      AddOverflow, 1, "rhs.p0.p1.carry");
+  Carry =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext");
+  auto *ResHi =
+      BuilderNoOverflowRHSonlyBB.CreateAdd(P1Hi, Carry, "rhs.p1.carry");
+
+  auto *ResLoEx =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(P0Lo, Ty, "rhs.res_lo.zext");
+  auto *ResMid =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(AddOResMid, Ty, "rhs.res_mid.zext");
+  auto *ResMidShl = BuilderNoOverflowRHSonlyBB.CreateShl(ResMid, VTHalfBitWidth,
+                                                         "rhs.res_mid.shl");
+  auto *FinalRes = BuilderNoOverflowRHSonlyBB.CreateOr(ResLoEx, ResMidShl,
+                                                       "rhs.res_lo.or.mid");
+  auto *IsOverflow = BuilderNoOverflowRHSonlyBB.CreateICmp(
+      ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy),
+      "rhs.check.overflow");
+
+  StructType *STy = StructType::get(
+      I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())});
+  Value *StructValNoOverflowRHS = PoisonValue::get(STy);
+  StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue(
+      StructValNoOverflowRHS, FinalRes, {0});
+  StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue(
+      StructValNoOverflowRHS, IsOverflow, {1});
+  BuilderNoOverflowRHSonlyBB.CreateBr(OverflowResBB);
+  //------------------------------------------------------------------------------
+
+  // BB overflow.no.lhs.only: LHS fits in the half-width type; RHS is full width.
+
+  LoRHS = BuilderNoOverflowLHSonlyBB.CreateZExt(LoRHS, Ty, "lo.rhs");
+
+  // P0 = (LHS * LoRHS)
+  P0 = BuilderNoOverflowLHSonlyBB.CreateMul(LHS, LoRHS,
+                                            "mul.no.overflow.lhs.lorhs");
+  P0Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.lhs");
+  P0Hi =
+      BuilderNoOverflowLHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.lsr.lhs");
+  P0Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.lhs");
+
+  // P1 = (LHS * HiRHS)
+  P1 = BuilderNoOverflowLHSonlyBB.CreateMul(LHS, ShrHiRHS,
+                                            "mul.no.overflow.lhs.hirhs");
+  P1Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.lhs");
+  P1Hi =
+      BuilderNoOverflowLHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.lhs.lsr");
+  P1Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.lhs");
+
+  AddOverflow = BuilderNoOverflowLHSonlyBB.CreateIntrinsic(
+      Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo});
+  AddOResMid = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 0,
+                                                             "lhs.p0.p1.res");
+  Carry = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 1,
+                                                        "lhs.p0.p1.carry");
+  Carry =
+      BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext");
+  ResHi = BuilderNoOverflowLHSonlyBB.CreateAdd(P1Hi, Carry, "lhs.p1.carry");
+
+  ResLoEx = BuilderNoOverflowLHSonlyBB.CreateZExt(P0Lo, Ty, "lhs.res_lo.zext");
+  ResMid =
+      BuilderNoOverflowLHSonlyBB.CreateZExt(AddOResMid, Ty, "lhs.res_mid.zext");
+  ResMidShl = BuilderNoOverflowLHSonlyBB.CreateShl(ResMid, VTHalfBitWidth,
+                                                   "lhs.res_mid.shl");
+  FinalRes = BuilderNoOverflowLHSonlyBB.CreateOr(ResLoEx, ResMidShl,
+                                                 "lhs.res_lo.or.mid");
+  IsOverflow = BuilderNoOverflowLHSonlyBB.CreateICmp(
+      ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy),
+      "lhs.check.overflow");
+
+  STy = StructType::get(I->getContext(),
+                        {Ty, IntegerType::getInt1Ty(I->getContext())});
+  Value *StructValNoOverflowLHS = PoisonValue::get(STy);
+  StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue(
+      StructValNoOverflowLHS, FinalRes, {0});
+  StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue(
+      StructValNoOverflowLHS, IsOverflow, {1});
+
+  BuilderNoOverflowLHSonlyBB.CreateBr(OverflowResBB);
+  //------------------------------------------------------------------------------
+
+  // BB overflow.no:
+  auto *Mul = BuilderNoOverflowBB.CreateMul(LHS, RHS, "mul.no.overflow");
+  STy = StructType::get(I->getContext(),
+                        {Ty, IntegerType::getInt1Ty(I->getContext())});
+  Value *StructValNoOverflow = PoisonValue::get(STy);
+  StructValNoOverflow =
+      BuilderNoOverflowBB.CreateInsertValue(StructValNoOverflow, Mul, {0});
+  StructValNoOverflow = BuilderNoOverflowBB.CreateInsertValue(
+      StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1});
+  BuilderNoOverflowBB.CreateBr(OverflowResBB);
+
+  // BB overflow.res:
+  auto *PHINode = BuilderOverflowResBB.CreatePHI(STy, 2);
+  PHINode->addIncoming(StructValNoOverflow, NoOverflowBB);
+  PHINode->addIncoming(StructValNoOverflowLHS, NoOverflowLHSonlyBB);
+  PHINode->addIncoming(StructValNoOverflowRHS, NoOverflowRHSonlyBB);
+
+  // Before moving the mul.overflow intrinsic into OverflowBB, replace all of
+  // its uses with the PHI node.
+  I->replaceAllUsesWith(PHINode);
+
+  // BB overflow:
+  PHINode->addIncoming(I, OverflowBB);
+  I->removeFromParent();
+  I->insertInto(OverflowBB, OverflowBB->end());
+  IRBuilder<>(OverflowBB, OverflowBB->end()).CreateBr(OverflowResBB);
+
+  // Return false so that the function is not reprocessed.
+  return false;
+}
+
+// Rewrite the smul_with_overflow intrinsic by checking whether either (or
+// both) of the operands fits in the legal half-width type. If so, a cheaper
+// multiplication algorithm can be used. Ideally this would be done during
+// type legalization, but reconstructing the IR is not possible there, so we
+// do it here instead.
+bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) {
+  if (TLI->getTypeAction(
+          I->getContext(),
+          TLI->getValueType(*DL, I->getType()->getContainedType(0))) !=
+      TargetLowering::TypeExpandInteger)
+    return false;
+  Value *LHS = I->getOperand(0);
+  Value *RHS = I->getOperand(1);
+  auto *Ty = LHS->getType();
+  unsigned VTBitWidth = Ty->getScalarSizeInBits();
+  unsigned VTHalfBitWidth = VTBitWidth / 2;
+  auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth);
+
+  assert(
+      (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) ==
+       TargetLowering::TypeLegal) &&
+      "Expected the type to be legal for the target lowering");
+
+  I->getParent()->setName("overflow.res");
+  auto *OverflowResBB = I->getParent();
+  auto *OverflowEntryBB =
+      I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true);
+  BasicBlock *OverflowLHSBB = BasicBlock::Create(
+      I->getContext(), "overflow.lhs", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowLHSBB = BasicBlock::Create(
+      I->getContext(), "overflow.no.lhs", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowRHSonlyBB = BasicBlock::Create(
+      I->getContext(), "overflow.no.rhs.only", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowLHSonlyBB = BasicBlock::Create(
+      I->getContext(), "overflow.no.lhs.only", I->getFunction(), OverflowResBB);
+  BasicBlock *NoOverflowBB = BasicBlock::Create(
+      I->getContext(), "overflow.no", I->getFunction(), OverflowResBB);
+  BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow",
+                                              I->getFunction(), OverflowResBB);
+  // The new CFG is:
+  //  overflow.entry:
+  //    hi(LHS) != sext(sign(lo(LHS))) ? overflow.lhs : overflow.no.lhs

+  //  overflow.lhs:
+  //    hi(RHS) != sext(sign(lo(RHS))) ? overflow : overflow.no.rhs.only

+  //  overflow.no.lhs:
+  //    hi(RHS) != sext(sign(lo(RHS))) ? overflow.no.lhs.only : overflow.no

+  //  overflow.no.rhs.only:
+  //  overflow.no.lhs.only:
+  //  overflow.no:
+  //  overflow:
+  //  overflow.res:
+
+  IRBuilder<> BuilderEntryBB(OverflowEntryBB->getTerminator());
+  IRBuilder<> BuilderOverflowLHSBB(OverflowLHSBB);
+  IRBuilder<> BuilderNoOverflowLHSBB(NoOverflowLHSBB);
+  IRBuilder<> BuilderNoOverflowRHSonlyBB(NoOverflowRHSonlyBB);
+  IRBuilder<> BuilderNoOverflowLHSonlyBB(NoOverflowLHSonlyBB);
+  IRBuilder<> BuilderNoOverflowBB(NoOverflowBB);
+  IRBuilder<> BuilderOverflowResBB(OverflowResBB,
+                                   OverflowResBB->getFirstInsertionPt());
+
+  //------------------------------------------------------------------------------
+  // BB overflow.entry:
+  // get Lo and Hi of RHS & LHS:
+
+  auto *LoRHS = BuilderEntryBB.CreateTrunc(RHS, LegalTy, "lo.rhs");
+  auto *SignLoRHS =
+      BuilderEntryBB.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs");
+  auto *HiRHS = BuilderEntryBB.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr");
+  HiRHS = BuilderEntryBB.CreateTrunc(HiRHS, LegalTy, "hi.rhs");
+
+  auto *LoLHS = BuilderEntryBB.CreateTrunc(LHS, LegalTy, "lo.lhs");
+  auto *SignLoLHS =
+      BuilderEntryBB.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs");
+  auto *HiLHS = BuilderEntryBB.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr");
+  HiLHS = BuilderEntryBB.CreateTrunc(HiLHS, LegalTy, "hi.lhs");
+
+  auto *Cmp = BuilderEntryBB.CreateCmp(ICmpInst::ICMP_NE, HiLHS, SignLoLHS);
+  BuilderEntryBB.CreateCondBr(Cmp, OverflowLHSBB, NoOverflowLHSBB);
+  OverflowEntryBB->getTerminator()->eraseFromParent();
+
+  //------------------------------------------------------------------------------
+  // BB overflow.lhs:
+  Cmp = BuilderOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS);
+  BuilderOverflowLHSBB.CreateCondBr(Cmp, OverflowBB, NoOverflowRHSonlyBB);
+
+  //------------------------------------------------------------------------------
+  // BB overflow.no.lhs:
+  Cmp = BuilderNoOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS);
+  BuilderNoOverflowLHSBB.CreateCondBr(Cmp, NoOverflowLHSonlyBB, NoOverflowBB);
+
+  //------------------------------------------------------------------------------
+  // BB overflow.no.rhs.only:
+  // RHS fits in the half-width type; LHS uses the full width.
+  // P0 = RHS * LoLHS
+  // P1 = RHS * HiLHS
+
+  // check sign of RHS:
+  auto *IsNegRHS = BuilderNoOverflowRHSonlyBB.CreateIsNeg(RHS, "rhs.isneg");
+  auto *AbsRHSIntr = BuilderNoOverflowRHSonlyBB.CreateBinaryIntrinsic(
+      Intrinsic::abs, RHS, ConstantInt::getFalse(I->getContext()), {},
+      "abs.rhs");
+  auto *AbsRHS = BuilderNoOverflowRHSonlyBB.CreateSelect(
+      IsNegRHS, AbsRHSIntr, RHS, "lo.abs.rhs.select");
+
+  // check sign of LHS:
+  auto *IsNegLHS = BuilderNoOverflowRHSonlyBB.CreateIsNeg(LHS, "lhs.isneg");
+  auto *AbsLHSIntr = BuilderNoOverflowRHSonlyBB.CreateBinaryIntrinsic(
+      Intrinsic::abs, LHS, ConstantInt::getFalse(I->getContext()), {},
+      "abs.lhs");
+  auto *AbsLHS = BuilderNoOverflowRHSonlyBB.CreateSelect(IsNegLHS, AbsLHSIntr,
+                                                         LHS, "abs.lhs.select");
+  LoLHS = BuilderNoOverflowRHSonlyBB.CreateAnd(
+      AbsLHS,
+      ConstantInt::get(Ty, APInt::getLowBitsSet(VTBitWidth, VTHalfBitWidth)),
+      "lo.abs.lhs");
+  HiLHS = BuilderNoOverflowRHSonlyBB.CreateLShr(AbsLHS, VTHalfBitWidth,
+                                                "hi.abs.lhs");
+
+  // P0 = (RHS * LoLHS)
+  auto *P0 = BuilderNoOverflowRHSonlyBB.CreateMul(AbsRHS, LoLHS,
+                                                  "mul.no.overflow.rhs.lolhs");
+  auto *P0Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.rhs");
+  auto *P0Hi =
+      BuilderNoOverflowRHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.rhs.lsr");
+  P0Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.rhs");
+
+  // P1 = (RHS * HiLHS)
+  auto *P1 = BuilderNoOverflowRHSonlyBB.CreateMul(AbsRHS, HiLHS,
+                                                  "mul.no.overflow.rhs.hilhs");
+  auto *P1Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.rhs");
+  auto *P1Hi =
+      BuilderNoOverflowRHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.rhs.lsr");
+  P1Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.rhs");
+
+  auto *AddOverflow = BuilderNoOverflowRHSonlyBB.CreateIntrinsic(
+      Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo});
+  auto *AddOResMid = BuilderNoOverflowRHSonlyBB.CreateExtractValue(
+      AddOverflow, 0, "rhs.p0.p1.res");
+  auto *Carry = BuilderNoOverflowRHSonlyBB.CreateExtractValue(
+      AddOverflow, 1, "rhs.p0.p1.carry");
+  Carry =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext");
+  auto *ResHi =
+      BuilderNoOverflowRHSonlyBB.CreateAdd(P1Hi, Carry, "rhs.p1.carry");
+
+  // sign handling:
+  auto *IsNeg = BuilderNoOverflowRHSonlyBB.CreateXor(IsNegRHS, IsNegLHS); // i1
+  auto *Mask =
+      BuilderNoOverflowRHSonlyBB.CreateSExt(IsNeg, LegalTy, "rhs.sign.mask");
+  auto *Add_1 =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(IsNeg, LegalTy, "rhs.add.1");
+  auto *ResLo =
+      BuilderNoOverflowRHSonlyBB.CreateXor(P0Lo, Mask, "rhs.res_lo.xor.mask");
+  ResLo =
+      BuilderNoOverflowRHSonlyBB.CreateAdd(ResLo, Add_1, "rhs.res_lo.add.1");
+
+  Carry = BuilderNoOverflowRHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResLo, Add_1,
+                                               "rhs.check.res_lo.carry");
+  Carry =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext");
+  auto *ResMid = BuilderNoOverflowRHSonlyBB.CreateXor(AddOResMid, Mask,
+                                                      "rhs.res_mid.xor.mask");
+  ResMid =
+      BuilderNoOverflowRHSonlyBB.CreateAdd(ResMid, Carry, "rhs.res_mid.carry");
+
+  Carry = BuilderNoOverflowRHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResMid,
+                                               Carry, "rhs.check.reslo.carry");
+  Carry =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext");
+  ResHi =
+      BuilderNoOverflowRHSonlyBB.CreateXor(ResHi, Mask, "rhs.res_hi.xor.mask");
+  ResHi =
+      BuilderNoOverflowRHSonlyBB.CreateAdd(ResHi, Carry, "rhs.res_hi.carry");
+  // Set the final result:
+  auto *ResLoEx =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(ResLo, Ty, "rhs.res_lo.zext");
+  ResMid =
+      BuilderNoOverflowRHSonlyBB.CreateZExt(ResMid, Ty, "rhs.res_mid.zext");
+  auto *ResMidShl = BuilderNoOverflowRHSonlyBB.CreateShl(ResMid, VTHalfBitWidth,
+                                                         "rhs.res_mid.shl");
+  auto *FinalRes = BuilderNoOverflowRHSonlyBB.CreateOr(ResLoEx, ResMidShl,
+                                                       "rhs.res_lo.or.mid");
+  auto *IsOverflow = BuilderNoOverflowRHSonlyBB.CreateICmp(
+      ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy),
+      "rhs.check.overflow");
+
+  StructType *STy = StructType::get(
+      I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())});
+  Value *StructValNoOverflowRHS = PoisonValue::get(STy);
+  StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue(
+      StructValNoOverflowRHS, FinalRes, {0});
+  StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue(
+      StructValNoOverflowRHS, IsOverflow, {1});
+  BuilderNoOverflowRHSonlyBB.CreateBr(OverflowResBB);
+  //------------------------------------------------------------------------------
+
+  // BB overflow.no.lhs.only:
+  // LHS fits in the half-width type; RHS uses the full width.
+  // P0 = LHS * LoRHS
+  // P1 = LHS * HiRHS
+
+  // check sign of LHS:
+  IsNegLHS = BuilderNoOverflowLHSonlyBB.CreateIsNeg(LHS, "lhs.isneg");
+  AbsLHSIntr = BuilderNoOverflowLHSonlyBB.CreateBinaryIntrinsic(
+      Intrinsic::abs, LHS, ConstantInt::getFalse(I->getContext()), {},
+      "abs.lhs");
+  AbsLHS = BuilderNoOverflowLHSonlyBB.CreateSelect(IsNegLHS, AbsLHSIntr, LHS,
+                                                   "abs.lhs.select");
+
+  // check sign of RHS:
+  IsNegRHS = BuilderNoOverflowLHSonlyBB.CreateIsNeg(RHS, "rhs.isneg");
+  AbsRHSIntr = BuilderNoOverflowLHSonlyBB.CreateBinaryIntrinsic(
+      Intrinsic::abs, RHS, ConstantInt::getFalse(I->getContext()), {},
+      "abs.rhs");
+  AbsRHS = BuilderNoOverflowLHSonlyBB.CreateSelect(IsNegRHS, AbsRHSIntr, RHS,
+                                                   "abs.rhs.select");
+
+  LoRHS = BuilderNoOverflowLHSonlyBB.CreateAnd(
+      AbsRHS,
+      ConstantInt::get(Ty, APInt::getLowBitsSet(VTBitWidth, VTHalfBitWidth)),
+      "lo.abs.rhs");
+  HiRHS = BuilderNoOverflowLHSonlyBB.CreateLShr(AbsRHS, VTHalfBitWidth,
+                                                "hi.abs.rhs");
+
+  // P0 = (LHS * LoRHS)
+  P0 = BuilderNoOverflowLHSonlyBB.CreateMul(AbsLHS, LoRHS,
+                                            "mul.no.overflow.lhs.lorhs");
+  P0Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.lhs");
+  P0Hi =
+      BuilderNoOverflowLHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.lsr.lhs");
+  P0Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.lhs");
+
+  // P1 = (LHS * HiRHS)
+  P1 = BuilderNoOverflowLHSonlyBB.CreateMul(AbsLHS, HiRHS,
+                                            "mul.no.overflow.lhs.hirhs");
+  P1Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.lhs");
+  P1Hi =
+      BuilderNoOverflowLHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.lhs.lsr");
+  P1Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.lhs");
+
+  AddOverflow = BuilderNoOverflowLHSonlyBB.CreateIntrinsic(
+      Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo});
+  AddOResMid = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 0,
+                                                             "lhs.p0.p1.res");
+  Carry = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 1,
+                                                        "lhs.p0.p1.carry");
+  Carry =
+      BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext");
+  ResHi = BuilderNoOverflowLHSonlyBB.CreateAdd(P1Hi, Carry, "lhs.p1.carry");
+
+  // sign handling:
+  IsNeg = BuilderNoOverflowLHSonlyBB.CreateXor(IsNegRHS, IsNegLHS); // i1
+  Mask = BuilderNoOverflowLHSonlyBB.CreateSExt(IsNeg, LegalTy, "lhs.sign.mask");
+  Add_1 = BuilderNoOverflowLHSonlyBB.CreateZExt(IsNeg, LegalTy, "lhs.add.1");
+  ResLo =
+      BuilderNoOverflowLHSonlyBB.CreateXor(P0Lo, Mask, "lhs.res_lo.xor.mask");
+  ResLo =
+      BuilderNoOverflowLHSonlyBB.CreateAdd(ResLo, Add_1, "lhs.res_lo.add.1");
+
+  Carry = BuilderNoOverflowLHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResLo, Add_1,
+                                               "lhs.check.res_lo.carry");
+  Carry =
+      BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext");
+  ResMid = BuilderNoOverflowLHSonlyBB.CreateXor(AddOResMid, Mask,
+                                                "lhs.res_mid.xor.mask");
+  ResMid =
+      BuilderNoOverflowLHSonlyBB.CreateAdd(ResMid, Carry, "lhs.res_mid.carry");
+
+  Carry = BuilderNoOverflowLHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResMid,
+                                               Carry, "lhs.check.reslo.carry");
+  Carry =
+      BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext");
+  ResHi =
+      BuilderNoOverflowLHSonlyBB.CreateXor(ResHi, Mask, "lhs.res_hi.xor.mask");
+  ResHi =
+      BuilderNoOverflowLHSonlyBB.CreateAdd(ResHi, Carry, "lhs.res_hi.carry");
+  // Set the final result:
+  ResLoEx = BuilderNoOverflowLHSonlyBB.CreateZExt(ResLo, Ty, "lhs.res_lo.zext");
+  ResMid =
+      BuilderNoOverflowLHSonlyBB.CreateZExt(ResMid, Ty, "lhs.res_mid.zext");
+  ResMidShl = BuilderNoOverflowLHSonlyBB.CreateShl(ResMid, VTHalfBitWidth,
+                                                   "lhs.res_mid.shl");
+  FinalRes = BuilderNoOverflowLHSonlyBB.CreateOr(ResLoEx, ResMidShl,
+                                                 "lhs.res_lo.or.mid");
+  IsOverflow = BuilderNoOverflowLHSonlyBB.CreateICmp(
+      ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy),
+      "lhs.check.overflow");
+
+  STy = StructType::get(I->getContext(),
+                        {Ty, IntegerType::getInt1Ty(I->getContext())});
+  Value *StructValNoOverflowLHS = PoisonValue::get(STy);
+  StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue(
+      StructValNoOverflowLHS, FinalRes, {0});
+  StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue(
+      StructValNoOverflowLHS, IsOverflow, {1});
+
+  BuilderNoOverflowLHSonlyBB.CreateBr(OverflowResBB);
+  //------------------------------------------------------------------------------
+
+  // BB overflow.no:
+  auto *Mul = BuilderNoOverflowBB.CreateMul(LHS, RHS, "mul.no.overflow");
+  STy = StructType::get(I->getContext(),
+                        {Ty, IntegerType::getInt1Ty(I->getContext())});
+  Value *StructValNoOverflow = PoisonValue::get(STy);
+  StructValNoOverflow =
+      BuilderNoOverflowBB.CreateInsertValue(StructValNoOverflow, Mul, {0});
+  StructValNoOverflow = BuilderNoOverflowBB.CreateInsertValue(
+      StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1});
+  BuilderNoOverflowBB.CreateBr(OverflowResBB);
+
+  // BB overflow.res:
+  auto *PHINode = BuilderOverflowResBB.CreatePHI(STy, 2);
+  PHINode->addIncoming(StructValNoOverflow, NoOverflowBB);
+  PHINode->addIncoming(StructValNoOverflowLHS, NoOverflowLHSonlyBB);
+  PHINode->addIncoming(StructValNoOverflowRHS, NoOverflowRHSonlyBB);
+
+  // Before moving the mul.overflow intrinsic into OverflowBB, replace all of
+  // its uses with the PHI node.
+  I->replaceAllUsesWith(PHINode);
+
+  // BB overflow:
+  PHINode->addIncoming(I, OverflowBB);
+  I->removeFromParent();
+  I->insertInto(OverflowBB, OverflowBB->end());
+  IRBuilder<>(OverflowBB, OverflowBB->end()).CreateBr(OverflowResBB);
+
+  // Return false so that the function is not reprocessed.
+  return false;
+}
+
 /// If there are any memory operands, use OptimizeMemoryInst to sink their
 /// address computing into the block when possible / profitable.
 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll
index 9e1c0c1b115ab..e2791f44d0a08 100644
--- a/llvm/test/CodeGen/AArch64/i128-math.ll
+++ b/llvm/test/CodeGen/AArch64/i128-math.ll
@@ -261,21 +261,55 @@ define i128 @u128_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_checked_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    cbz x1, .LBB17_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cbz x3, .LBB17_5
+; CHECK-NEXT:  // %bb.2: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w8, w8, wzr, lo
-; CHECK-NEXT:    eor w2, w8, #0x1
+; CHECK-NEXT:    b .LBB17_8
+; CHECK-NEXT:  .LBB17_3: // %overflow.no.lhs
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    cbz x3, .LBB17_7
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    madd x8, x1, x2, x8
+; CHECK-NEXT:    umulh x9, x0, x3
+; CHECK-NEXT:    mul x10, x0, x3
+; CHECK-NEXT:    mul x11, x1, x3
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    b .LBB17_6
+; CHECK-NEXT:  .LBB17_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    umulh x8, x2, x0
+; CHECK-NEXT:    umulh x9, x2, x1
+; CHECK-NEXT:    madd x8, x3, x0, x8
+; CHECK-NEXT:    mul x10, x2, x1
+; CHECK-NEXT:    mul x11, x3, x1
+; CHECK-NEXT:    mul x0, x2, x0
+; CHECK-NEXT:  .LBB17_6: // %overflow.res
+; CHECK-NEXT:    adds x1, x8, x10
+; CHECK-NEXT:    adcs xzr, x9, x11
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    b .LBB17_8
+; CHECK-NEXT:  .LBB17_7: // %overflow.no
+; CHECK-NEXT:    madd x8, x0, x3, x8
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  .LBB17_8: // %overflow.res
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    bic w2, w9, w8
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -289,20 +323,54 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_overflowing_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    cbz x1, .LBB18_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cbz x3, .LBB18_5
+; CHECK-NEXT:  // %bb.2: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
-; CHECK-NEXT:    csinc w2, w8, wzr, lo
+; CHECK-NEXT:    csinc w8, w8, wzr, lo
+; CHECK-NEXT:    and w2, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB18_3: // %overflow.no.lhs
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    cbz x3, .LBB18_7
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    madd x8, x1, x2, x8
+; CHECK-NEXT:    umulh x9, x0, x3
+; CHECK-NEXT:    mul x10, x0, x3
+; CHECK-NEXT:    mul x11, x1, x3
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    b .LBB18_6
+; CHECK-NEXT:  .LBB18_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    umulh x8, x2, x0
+; CHECK-NEXT:    umulh x9, x2, x1
+; CHECK-NEXT:    madd x8, x3, x0, x8
+; CHECK-NEXT:    mul x10, x2, x1
+; CHECK-NEXT:    mul x11, x3, x1
+; CHECK-NEXT:    mul x0, x2, x0
+; CHECK-NEXT:  .LBB18_6: // %overflow.res
+; CHECK-NEXT:    adds x1, x8, x10
+; CHECK-NEXT:    adcs xzr, x9, x11
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    and w2, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB18_7: // %overflow.no
+; CHECK-NEXT:    madd x8, x0, x3, x8
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    and w2, wzr, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -315,21 +383,54 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 
 define i128 @u128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_saturating_mul:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul x9, x3, x0
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    cbz x1, .LBB19_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cbz x3, .LBB19_5
+; CHECK-NEXT:  // %bb.2: // %overflow
+; CHECK-NEXT:    mul x8, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
-; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x9, x3, x0
+; CHECK-NEXT:    madd x11, x1, x2, x8
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x12, x0, x2
+; CHECK-NEXT:    ccmp xzr, x9, #0, eq
 ; CHECK-NEXT:    mul x8, x0, x2
 ; CHECK-NEXT:    cset w10, ne
-; CHECK-NEXT:    adds x9, x11, x9
+; CHECK-NEXT:    adds x9, x12, x11
 ; CHECK-NEXT:    csinc w10, w10, wzr, lo
-; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    b .LBB19_8
+; CHECK-NEXT:  .LBB19_3: // %overflow.no.lhs
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    cbz x3, .LBB19_7
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    madd x9, x1, x2, x8
+; CHECK-NEXT:    umulh x10, x0, x3
+; CHECK-NEXT:    mul x11, x0, x3
+; CHECK-NEXT:    mul x12, x1, x3
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:    b .LBB19_6
+; CHECK-NEXT:  .LBB19_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    umulh x8, x2, x0
+; CHECK-NEXT:    umulh x10, x2, x1
+; CHECK-NEXT:    madd x9, x3, x0, x8
+; CHECK-NEXT:    mul x11, x2, x1
+; CHECK-NEXT:    mul x12, x3, x1
+; CHECK-NEXT:    mul x8, x2, x0
+; CHECK-NEXT:  .LBB19_6: // %overflow.res
+; CHECK-NEXT:    adds x9, x9, x11
+; CHECK-NEXT:    adcs xzr, x10, x12
+; CHECK-NEXT:    cset w10, ne
+; CHECK-NEXT:    b .LBB19_8
+; CHECK-NEXT:  .LBB19_7: // %overflow.no
+; CHECK-NEXT:    madd x8, x0, x3, x8
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    madd x9, x1, x2, x8
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:  .LBB19_8: // %overflow.res
+; CHECK-NEXT:    tst w10, #0x1
 ; CHECK-NEXT:    csinv x0, x8, xzr, eq
 ; CHECK-NEXT:    csinv x1, x9, xzr, eq
 ; CHECK-NEXT:    ret
@@ -354,7 +455,14 @@ define i128 @i128_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_checked_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    asr x8, x2, #63
+; CHECK-NEXT:    cmp x1, x0, asr #63
+; CHECK-NEXT:    b.eq .LBB21_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB21_5
+; CHECK-NEXT:  // %bb.2: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -364,24 +472,106 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x9, x8, x9
+; CHECK-NEXT:    adc x8, x8, x9
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x8, x14, x10
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    adds x9, x14, x10
 ; CHECK-NEXT:    mul x15, x1, x3
 ; CHECK-NEXT:    smulh x10, x1, x3
-; CHECK-NEXT:    mov x1, x8
-; CHECK-NEXT:    adc x11, x12, x13
+; CHECK-NEXT:    mov x1, x9
+; CHECK-NEXT:    adc x9, x12, x13
 ; CHECK-NEXT:    asr x12, x9, #63
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    adds x9, x9, x11
-; CHECK-NEXT:    asr x11, x8, #63
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    adc x12, x12, x13
-; CHECK-NEXT:    adds x9, x15, x9
-; CHECK-NEXT:    adc x10, x10, x12
-; CHECK-NEXT:    cmp x9, x11
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w2, eq
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x10, x10, x11
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    ccmp x10, x9, #0, eq
+; CHECK-NEXT:    b .LBB21_7
+; CHECK-NEXT:  .LBB21_3: // %overflow.no.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB21_8
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    asr x8, x1, #63
+; CHECK-NEXT:    asr x10, x3, #63
+; CHECK-NEXT:    eor x9, x0, x8
+; CHECK-NEXT:    eor x11, x1, x8
+; CHECK-NEXT:    eor x12, x2, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    eor x11, x3, x10
+; CHECK-NEXT:    csel x8, x8, x1, lt
+; CHECK-NEXT:    csel x9, x9, x0, lt
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    csel x11, x12, x2, lt
+; CHECK-NEXT:    csel x10, x10, x3, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w12, w13
+; CHECK-NEXT:    b .LBB21_6
+; CHECK-NEXT:  .LBB21_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    asr x8, x3, #63
+; CHECK-NEXT:    asr x10, x1, #63
+; CHECK-NEXT:    eor x9, x2, x8
+; CHECK-NEXT:    eor x11, x3, x8
+; CHECK-NEXT:    eor x12, x0, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    eor x11, x1, x10
+; CHECK-NEXT:    csel x8, x8, x3, lt
+; CHECK-NEXT:    csel x9, x9, x2, lt
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    csel x11, x12, x0, lt
+; CHECK-NEXT:    csel x10, x10, x1, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w13, w12
+; CHECK-NEXT:  .LBB21_6: // %overflow.res
+; CHECK-NEXT:    sbfx x12, x10, #0, #1
+; CHECK-NEXT:    adds x8, x8, x11
+; CHECK-NEXT:    adc x9, x9, x15
+; CHECK-NEXT:    eor x13, x14, x12
+; CHECK-NEXT:    eor x8, x8, x12
+; CHECK-NEXT:    add x0, x13, x10
+; CHECK-NEXT:    cmp x0, x10
+; CHECK-NEXT:    cset w10, lo
+; CHECK-NEXT:    cinc x1, x8, lo
+; CHECK-NEXT:    eor x8, x9, x12
+; CHECK-NEXT:    cmp x1, x10
+; CHECK-NEXT:    cinc x8, x8, lo
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:  .LBB21_7: // %overflow.res
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    b .LBB21_9
+; CHECK-NEXT:  .LBB21_8: // %overflow.no
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    madd x8, x0, x3, x8
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  .LBB21_9: // %overflow.res
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    bic w2, w9, w8
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -395,7 +585,14 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
 
 define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_overflowing_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    asr x8, x2, #63
+; CHECK-NEXT:    cmp x1, x0, asr #63
+; CHECK-NEXT:    b.eq .LBB22_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB22_5
+; CHECK-NEXT:  // %bb.2: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -405,24 +602,104 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    mul x14, x0, x3
 ; CHECK-NEXT:    umulh x12, x0, x3
-; CHECK-NEXT:    adc x9, x8, x9
+; CHECK-NEXT:    adc x8, x8, x9
 ; CHECK-NEXT:    mul x13, x0, x13
-; CHECK-NEXT:    adds x8, x14, x10
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    adds x9, x14, x10
 ; CHECK-NEXT:    mul x15, x1, x3
 ; CHECK-NEXT:    smulh x10, x1, x3
-; CHECK-NEXT:    mov x1, x8
-; CHECK-NEXT:    adc x11, x12, x13
+; CHECK-NEXT:    mov x1, x9
+; CHECK-NEXT:    adc x9, x12, x13
 ; CHECK-NEXT:    asr x12, x9, #63
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    adds x9, x9, x11
-; CHECK-NEXT:    asr x11, x8, #63
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    adc x12, x12, x13
-; CHECK-NEXT:    adds x9, x15, x9
-; CHECK-NEXT:    adc x10, x10, x12
-; CHECK-NEXT:    cmp x9, x11
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w2, ne
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x10, x10, x11
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    ccmp x10, x9, #0, eq
+; CHECK-NEXT:    b .LBB22_7
+; CHECK-NEXT:  .LBB22_3: // %overflow.no.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB22_8
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    asr x8, x1, #63
+; CHECK-NEXT:    asr x10, x3, #63
+; CHECK-NEXT:    eor x9, x0, x8
+; CHECK-NEXT:    eor x11, x1, x8
+; CHECK-NEXT:    eor x12, x2, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    eor x11, x3, x10
+; CHECK-NEXT:    csel x8, x8, x1, lt
+; CHECK-NEXT:    csel x9, x9, x0, lt
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    csel x11, x12, x2, lt
+; CHECK-NEXT:    csel x10, x10, x3, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w12, w13
+; CHECK-NEXT:    b .LBB22_6
+; CHECK-NEXT:  .LBB22_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    asr x8, x3, #63
+; CHECK-NEXT:    asr x10, x1, #63
+; CHECK-NEXT:    eor x9, x2, x8
+; CHECK-NEXT:    eor x11, x3, x8
+; CHECK-NEXT:    eor x12, x0, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    eor x11, x1, x10
+; CHECK-NEXT:    csel x8, x8, x3, lt
+; CHECK-NEXT:    csel x9, x9, x2, lt
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    csel x11, x12, x0, lt
+; CHECK-NEXT:    csel x10, x10, x1, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w13, w12
+; CHECK-NEXT:  .LBB22_6: // %overflow.res
+; CHECK-NEXT:    sbfx x12, x10, #0, #1
+; CHECK-NEXT:    adds x8, x8, x11
+; CHECK-NEXT:    adc x9, x9, x15
+; CHECK-NEXT:    eor x13, x14, x12
+; CHECK-NEXT:    eor x8, x8, x12
+; CHECK-NEXT:    add x0, x13, x10
+; CHECK-NEXT:    cmp x0, x10
+; CHECK-NEXT:    cset w10, lo
+; CHECK-NEXT:    cinc x1, x8, lo
+; CHECK-NEXT:    eor x8, x9, x12
+; CHECK-NEXT:    cmp x1, x10
+; CHECK-NEXT:    cinc x8, x8, lo
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:  .LBB22_7: // %overflow.res
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    and w2, w8, #0x1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB22_8: // %overflow.no
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    madd x8, x0, x3, x8
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    and w2, wzr, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -435,7 +712,14 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
 
 define i128 @i128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: i128_saturating_mul:
-; CHECK:       // %bb.0:
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    asr x8, x2, #63
+; CHECK-NEXT:    cmp x1, x0, asr #63
+; CHECK-NEXT:    b.eq .LBB23_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB23_5
+; CHECK-NEXT:  // %bb.2: // %overflow
 ; CHECK-NEXT:    asr x9, x1, #63
 ; CHECK-NEXT:    umulh x10, x0, x2
 ; CHECK-NEXT:    asr x13, x3, #63
@@ -448,26 +732,106 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    adc x8, x8, x9
 ; CHECK-NEXT:    mul x13, x0, x13
 ; CHECK-NEXT:    adds x9, x14, x10
-; CHECK-NEXT:    mul x11, x1, x3
-; CHECK-NEXT:    adc x10, x12, x13
-; CHECK-NEXT:    smulh x12, x1, x3
-; CHECK-NEXT:    asr x13, x8, #63
-; CHECK-NEXT:    asr x14, x10, #63
-; CHECK-NEXT:    adds x8, x8, x10
-; CHECK-NEXT:    adc x10, x13, x14
-; CHECK-NEXT:    adds x8, x11, x8
-; CHECK-NEXT:    asr x11, x9, #63
-; CHECK-NEXT:    mul x13, x0, x2
-; CHECK-NEXT:    adc x10, x12, x10
-; CHECK-NEXT:    eor x12, x3, x1
-; CHECK-NEXT:    eor x8, x8, x11
-; CHECK-NEXT:    eor x10, x10, x11
-; CHECK-NEXT:    asr x11, x12, #63
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    eor x10, x11, #0x7fffffffffffffff
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    csinv x0, x13, x11, eq
-; CHECK-NEXT:    csel x1, x10, x9, ne
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    asr x14, x9, #63
+; CHECK-NEXT:    smulh x10, x1, x3
+; CHECK-NEXT:    adc x11, x12, x13
+; CHECK-NEXT:    asr x12, x8, #63
+; CHECK-NEXT:    asr x13, x11, #63
+; CHECK-NEXT:    adds x11, x8, x11
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:    adc x12, x12, x13
+; CHECK-NEXT:    adds x11, x15, x11
+; CHECK-NEXT:    adc x10, x10, x12
+; CHECK-NEXT:    cmp x11, x14
+; CHECK-NEXT:    ccmp x10, x14, #0, eq
+; CHECK-NEXT:    b .LBB23_7
+; CHECK-NEXT:  .LBB23_3: // %overflow.no.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB23_8
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    asr x8, x1, #63
+; CHECK-NEXT:    asr x10, x3, #63
+; CHECK-NEXT:    eor x9, x0, x8
+; CHECK-NEXT:    eor x11, x1, x8
+; CHECK-NEXT:    eor x12, x2, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    eor x11, x3, x10
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    csel x8, x8, x1, lt
+; CHECK-NEXT:    csel x9, x9, x0, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    csel x11, x12, x2, lt
+; CHECK-NEXT:    csel x10, x10, x3, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w12, w13
+; CHECK-NEXT:    b .LBB23_6
+; CHECK-NEXT:  .LBB23_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    asr x8, x3, #63
+; CHECK-NEXT:    asr x10, x1, #63
+; CHECK-NEXT:    eor x9, x2, x8
+; CHECK-NEXT:    eor x11, x3, x8
+; CHECK-NEXT:    eor x12, x0, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    eor x11, x1, x10
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    csel x8, x8, x3, lt
+; CHECK-NEXT:    csel x9, x9, x2, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    csel x11, x12, x0, lt
+; CHECK-NEXT:    csel x10, x10, x1, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w13, w12
+; CHECK-NEXT:  .LBB23_6: // %overflow.res
+; CHECK-NEXT:    sbfx x12, x10, #0, #1
+; CHECK-NEXT:    adds x11, x8, x11
+; CHECK-NEXT:    eor x13, x14, x12
+; CHECK-NEXT:    add x8, x13, x10
+; CHECK-NEXT:    adc x13, x9, x15
+; CHECK-NEXT:    eor x9, x11, x12
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    cset w10, lo
+; CHECK-NEXT:    cinc x9, x9, lo
+; CHECK-NEXT:    cmp x9, x10
+; CHECK-NEXT:    eor x10, x13, x12
+; CHECK-NEXT:    cinc x10, x10, lo
+; CHECK-NEXT:    cmp x10, #0
+; CHECK-NEXT:  .LBB23_7: // %overflow.res
+; CHECK-NEXT:    cset w10, ne
+; CHECK-NEXT:    b .LBB23_9
+; CHECK-NEXT:  .LBB23_8: // %overflow.no
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    mov w10, wzr
+; CHECK-NEXT:    madd x8, x0, x3, x8
+; CHECK-NEXT:    madd x9, x1, x2, x8
+; CHECK-NEXT:    mul x8, x0, x2
+; CHECK-NEXT:  .LBB23_9: // %overflow.res
+; CHECK-NEXT:    eor x11, x3, x1
+; CHECK-NEXT:    tst w10, #0x1
+; CHECK-NEXT:    asr x11, x11, #63
+; CHECK-NEXT:    eor x12, x11, #0x7fffffffffffffff
+; CHECK-NEXT:    csinv x0, x8, x11, eq
+; CHECK-NEXT:    csel x1, x12, x9, ne
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
index 9924b7c63f763..ef004085373cd 100644
--- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
+++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
@@ -223,22 +223,49 @@ cleanup:
 
 define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-LABEL: test_umul_i128:
-; CHECK:       // %bb.0: // %entry
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    cbz x1, .LBB4_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cbz x3, .LBB4_5
+; CHECK-NEXT:  // %bb.2: // %overflow
 ; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x8, x1, x2
-; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    umulh x10, x1, x2
+; CHECK-NEXT:    umulh x8, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x11, x0, x2
 ; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x8, #0, eq
+; CHECK-NEXT:    mul x0, x0, x2
 ; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    adds x1, x11, x9
 ; CHECK-NEXT:    csinc w8, w8, wzr, lo
-; CHECK-NEXT:    cmp w8, #1
-; CHECK-NEXT:    b.ne .LBB4_2
-; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    tbnz w8, #0, .LBB4_7
+; CHECK-NEXT:    b .LBB4_8
+; CHECK-NEXT:  .LBB4_3: // %overflow.no.lhs
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    cbz x3, .LBB4_9
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    madd x8, x1, x2, x8
+; CHECK-NEXT:    umulh x9, x0, x3
+; CHECK-NEXT:    mul x10, x0, x3
+; CHECK-NEXT:    mul x11, x1, x3
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    b .LBB4_6
+; CHECK-NEXT:  .LBB4_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    umulh x8, x2, x0
+; CHECK-NEXT:    umulh x9, x2, x1
+; CHECK-NEXT:    madd x8, x3, x0, x8
+; CHECK-NEXT:    mul x10, x2, x1
+; CHECK-NEXT:    mul x11, x3, x1
+; CHECK-NEXT:    mul x0, x2, x0
+; CHECK-NEXT:  .LBB4_6: // %overflow.res
+; CHECK-NEXT:    adds x1, x8, x10
+; CHECK-NEXT:    adcs xzr, x9, x11
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    tbz w8, #0, .LBB4_8
+; CHECK-NEXT:  .LBB4_7: // %if.then
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -247,10 +274,15 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-NEXT:    sxtw x0, w0
 ; CHECK-NEXT:    asr x1, x0, #63
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:  .LBB4_8: // %cleanup
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB4_2: // %if.end
+; CHECK-NEXT:  .LBB4_9: // %overflow.no
+; CHECK-NEXT:    madd x8, x0, x3, x8
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    tbnz w8, #0, .LBB4_7
+; CHECK-NEXT:    b .LBB4_8
 entry:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %1 = extractvalue { i128, i1 } %0, 1
@@ -272,35 +304,115 @@ cleanup:
 
 define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-LABEL: test_smul_i128:
-; CHECK:       // %bb.0: // %entry
+; CHECK:       // %bb.0: // %overflow.entry
+; CHECK-NEXT:    asr x8, x2, #63
+; CHECK-NEXT:    cmp x1, x0, asr #63
+; CHECK-NEXT:    b.eq .LBB5_3
+; CHECK-NEXT:  // %bb.1: // %overflow.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB5_5
+; CHECK-NEXT:  // %bb.2: // %overflow
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    umulh x10, x0, x2
+; CHECK-NEXT:    asr x13, x3, #63
+; CHECK-NEXT:    mul x11, x1, x2
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    mul x9, x9, x2
+; CHECK-NEXT:    adds x10, x11, x10
+; CHECK-NEXT:    mul x14, x0, x3
+; CHECK-NEXT:    umulh x12, x0, x3
+; CHECK-NEXT:    adc x8, x8, x9
+; CHECK-NEXT:    mul x13, x0, x13
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    adds x9, x14, x10
+; CHECK-NEXT:    mul x15, x1, x3
+; CHECK-NEXT:    smulh x10, x1, x3
+; CHECK-NEXT:    mov x1, x9
+; CHECK-NEXT:    adc x9, x12, x13
+; CHECK-NEXT:    asr x12, x9, #63
+; CHECK-NEXT:    mul x0, x0, x2
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    asr x9, x1, #63
+; CHECK-NEXT:    adc x11, x11, x12
+; CHECK-NEXT:    adds x8, x15, x8
+; CHECK-NEXT:    adc x10, x10, x11
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    ccmp x10, x9, #0, eq
+; CHECK-NEXT:    b .LBB5_7
+; CHECK-NEXT:  .LBB5_3: // %overflow.no.lhs
+; CHECK-NEXT:    cmp x3, x8
+; CHECK-NEXT:    b.eq .LBB5_10
+; CHECK-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; CHECK-NEXT:    asr x8, x1, #63
+; CHECK-NEXT:    asr x10, x3, #63
+; CHECK-NEXT:    eor x9, x0, x8
+; CHECK-NEXT:    eor x11, x1, x8
+; CHECK-NEXT:    eor x12, x2, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    eor x11, x3, x10
+; CHECK-NEXT:    csel x8, x8, x1, lt
+; CHECK-NEXT:    csel x9, x9, x0, lt
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    csel x11, x12, x2, lt
+; CHECK-NEXT:    csel x10, x10, x3, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w12, w13
+; CHECK-NEXT:    b .LBB5_6
+; CHECK-NEXT:  .LBB5_5: // %overflow.no.rhs.only
+; CHECK-NEXT:    asr x8, x3, #63
 ; CHECK-NEXT:    asr x10, x1, #63
-; CHECK-NEXT:    umulh x11, x0, x2
-; CHECK-NEXT:    asr x14, x3, #63
-; CHECK-NEXT:    mov x8, x1
-; CHECK-NEXT:    mul x12, x1, x2
-; CHECK-NEXT:    umulh x9, x1, x2
-; CHECK-NEXT:    mul x10, x10, x2
-; CHECK-NEXT:    adds x11, x12, x11
-; CHECK-NEXT:    mul x15, x0, x3
-; CHECK-NEXT:    umulh x13, x0, x3
-; CHECK-NEXT:    adc x9, x9, x10
-; CHECK-NEXT:    mul x14, x0, x14
-; CHECK-NEXT:    mul x16, x1, x3
-; CHECK-NEXT:    adds x1, x15, x11
-; CHECK-NEXT:    asr x11, x9, #63
-; CHECK-NEXT:    smulh x8, x8, x3
-; CHECK-NEXT:    adc x10, x13, x14
-; CHECK-NEXT:    asr x12, x10, #63
-; CHECK-NEXT:    adds x9, x9, x10
-; CHECK-NEXT:    adc x10, x11, x12
-; CHECK-NEXT:    adds x9, x16, x9
-; CHECK-NEXT:    asr x11, x1, #63
-; CHECK-NEXT:    adc x8, x8, x10
-; CHECK-NEXT:    eor x8, x8, x11
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    cbz x8, .LBB5_2
-; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    eor x9, x2, x8
+; CHECK-NEXT:    eor x11, x3, x8
+; CHECK-NEXT:    eor x12, x0, x10
+; CHECK-NEXT:    subs x9, x9, x8
+; CHECK-NEXT:    sbc x8, x11, x8
+; CHECK-NEXT:    cmp x3, #0
+; CHECK-NEXT:    eor x11, x1, x10
+; CHECK-NEXT:    csel x8, x8, x3, lt
+; CHECK-NEXT:    csel x9, x9, x2, lt
+; CHECK-NEXT:    cset w13, lt
+; CHECK-NEXT:    subs x12, x12, x10
+; CHECK-NEXT:    sbc x10, x11, x10
+; CHECK-NEXT:    cmp x1, #0
+; CHECK-NEXT:    csel x11, x12, x0, lt
+; CHECK-NEXT:    csel x10, x10, x1, lt
+; CHECK-NEXT:    umulh x12, x9, x11
+; CHECK-NEXT:    mul x14, x9, x11
+; CHECK-NEXT:    mul x15, x8, x10
+; CHECK-NEXT:    madd x8, x8, x11, x12
+; CHECK-NEXT:    cset w12, lt
+; CHECK-NEXT:    mul x11, x9, x10
+; CHECK-NEXT:    umulh x9, x9, x10
+; CHECK-NEXT:    eor w10, w13, w12
+; CHECK-NEXT:  .LBB5_6: // %overflow.res
+; CHECK-NEXT:    sbfx x12, x10, #0, #1
+; CHECK-NEXT:    adds x8, x8, x11
+; CHECK-NEXT:    adc x9, x9, x15
+; CHECK-NEXT:    eor x13, x14, x12
+; CHECK-NEXT:    eor x8, x8, x12
+; CHECK-NEXT:    add x0, x13, x10
+; CHECK-NEXT:    cmp x0, x10
+; CHECK-NEXT:    cset w10, lo
+; CHECK-NEXT:    cinc x1, x8, lo
+; CHECK-NEXT:    eor x8, x9, x12
+; CHECK-NEXT:    cmp x1, x10
+; CHECK-NEXT:    cinc x8, x8, lo
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:  .LBB5_7: // %overflow.res
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    tbz w8, #0, .LBB5_9
+; CHECK-NEXT:  .LBB5_8: // %if.then
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
@@ -309,10 +421,16 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
 ; CHECK-NEXT:    sxtw x0, w0
 ; CHECK-NEXT:    asr x1, x0, #63
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:  .LBB5_9: // %cleanup
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB5_2: // %if.end
+; CHECK-NEXT:  .LBB5_10: // %overflow.no
+; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    madd x8, x0, x3, x8
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    madd x1, x1, x2, x8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    tbnz w8, #0, .LBB5_8
+; CHECK-NEXT:    b .LBB5_9
 entry:
   %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
   %1 = extractvalue { i128, i1 } %0, 1
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index edfd80b4f2706..a240055b3f655 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -3,20 +3,54 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; AARCH-LABEL: muloti_test:
-; AARCH:       // %bb.0: // %start
+; AARCH:       // %bb.0: // %overflow.entry
+; AARCH-NEXT:    cbz x1, .LBB0_3
+; AARCH-NEXT:  // %bb.1: // %overflow.lhs
+; AARCH-NEXT:    cbz x3, .LBB0_5
+; AARCH-NEXT:  // %bb.2: // %overflow
 ; AARCH-NEXT:    mul x9, x3, x0
 ; AARCH-NEXT:    cmp x1, #0
 ; AARCH-NEXT:    ccmp x3, #0, #4, ne
-; AARCH-NEXT:    umulh x8, x1, x2
-; AARCH-NEXT:    umulh x10, x3, x0
+; AARCH-NEXT:    umulh x10, x1, x2
+; AARCH-NEXT:    umulh x8, x3, x0
 ; AARCH-NEXT:    madd x9, x1, x2, x9
-; AARCH-NEXT:    ccmp xzr, x8, #0, eq
-; AARCH-NEXT:    umulh x11, x0, x2
 ; AARCH-NEXT:    ccmp xzr, x10, #0, eq
+; AARCH-NEXT:    umulh x11, x0, x2
+; AARCH-NEXT:    ccmp xzr, x8, #0, eq
 ; AARCH-NEXT:    mul x0, x0, x2
 ; AARCH-NEXT:    cset w8, ne
 ; AARCH-NEXT:    adds x1, x11, x9
-; AARCH-NEXT:    csinc w2, w8, wzr, lo
+; AARCH-NEXT:    csinc w8, w8, wzr, lo
+; AARCH-NEXT:    and w2, w8, #0x1
+; AARCH-NEXT:    ret
+; AARCH-NEXT:  .LBB0_3: // %overflow.no.lhs
+; AARCH-NEXT:    umulh x8, x0, x2
+; AARCH-NEXT:    cbz x3, .LBB0_7
+; AARCH-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; AARCH-NEXT:    madd x8, x1, x2, x8
+; AARCH-NEXT:    umulh x9, x0, x3
+; AARCH-NEXT:    mul x10, x0, x3
+; AARCH-NEXT:    mul x11, x1, x3
+; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    b .LBB0_6
+; AARCH-NEXT:  .LBB0_5: // %overflow.no.rhs.only
+; AARCH-NEXT:    umulh x8, x2, x0
+; AARCH-NEXT:    umulh x9, x2, x1
+; AARCH-NEXT:    madd x8, x3, x0, x8
+; AARCH-NEXT:    mul x10, x2, x1
+; AARCH-NEXT:    mul x11, x3, x1
+; AARCH-NEXT:    mul x0, x2, x0
+; AARCH-NEXT:  .LBB0_6: // %overflow.res
+; AARCH-NEXT:    adds x1, x8, x10
+; AARCH-NEXT:    adcs xzr, x9, x11
+; AARCH-NEXT:    cset w8, ne
+; AARCH-NEXT:    and w2, w8, #0x1
+; AARCH-NEXT:    ret
+; AARCH-NEXT:  .LBB0_7: // %overflow.no
+; AARCH-NEXT:    madd x8, x0, x3, x8
+; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    madd x1, x1, x2, x8
+; AARCH-NEXT:    and w2, wzr, #0x1
 ; AARCH-NEXT:    ret
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
@@ -34,46 +68,133 @@ start:
 
 define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
 ; AARCH-LABEL: __muloti4:
-; AARCH:       // %bb.0: // %Entry
-; AARCH-NEXT:    asr x11, x1, #63
-; AARCH-NEXT:    asr x9, x3, #63
-; AARCH-NEXT:    umulh x12, x0, x2
-; AARCH-NEXT:    mov x8, x1
+; AARCH:       // %bb.0: // %overflow.entry
+; AARCH-NEXT:    asr x8, x2, #63
+; AARCH-NEXT:    cmp x1, x0, asr #63
 ; AARCH-NEXT:    str wzr, [x4]
-; AARCH-NEXT:    mul x13, x1, x2
-; AARCH-NEXT:    umulh x10, x1, x2
-; AARCH-NEXT:    mul x11, x11, x2
-; AARCH-NEXT:    adds x12, x13, x12
-; AARCH-NEXT:    mul x15, x0, x3
-; AARCH-NEXT:    umulh x14, x0, x3
-; AARCH-NEXT:    adc x10, x10, x11
-; AARCH-NEXT:    mul x9, x0, x9
-; AARCH-NEXT:    mul x16, x1, x3
-; AARCH-NEXT:    adds x1, x15, x12
-; AARCH-NEXT:    asr x12, x10, #63
-; AARCH-NEXT:    smulh x11, x8, x3
-; AARCH-NEXT:    adc x9, x14, x9
-; AARCH-NEXT:    asr x13, x9, #63
-; AARCH-NEXT:    adds x9, x10, x9
-; AARCH-NEXT:    asr x10, x1, #63
+; AARCH-NEXT:    b.eq .LBB1_3
+; AARCH-NEXT:  // %bb.1: // %overflow.lhs
+; AARCH-NEXT:    cmp x3, x8
+; AARCH-NEXT:    b.eq .LBB1_5
+; AARCH-NEXT:  // %bb.2: // %overflow
+; AARCH-NEXT:    asr x9, x1, #63
+; AARCH-NEXT:    umulh x10, x0, x2
+; AARCH-NEXT:    asr x13, x3, #63
+; AARCH-NEXT:    mul x11, x1, x2
+; AARCH-NEXT:    umulh x8, x1, x2
+; AARCH-NEXT:    mul x9, x9, x2
+; AARCH-NEXT:    adds x10, x11, x10
+; AARCH-NEXT:    mul x14, x0, x3
+; AARCH-NEXT:    umulh x12, x0, x3
+; AARCH-NEXT:    adc x9, x8, x9
+; AARCH-NEXT:    mul x13, x0, x13
+; AARCH-NEXT:    adds x8, x14, x10
+; AARCH-NEXT:    mul x15, x1, x3
+; AARCH-NEXT:    smulh x10, x1, x3
+; AARCH-NEXT:    adc x11, x12, x13
+; AARCH-NEXT:    asr x12, x9, #63
+; AARCH-NEXT:    asr x13, x11, #63
 ; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    adds x9, x9, x11
+; AARCH-NEXT:    asr x11, x8, #63
 ; AARCH-NEXT:    adc x12, x12, x13
-; AARCH-NEXT:    adds x9, x16, x9
-; AARCH-NEXT:    adc x11, x11, x12
-; AARCH-NEXT:    cmp x9, x10
-; AARCH-NEXT:    ccmp x11, x10, #0, eq
+; AARCH-NEXT:    adds x9, x15, x9
+; AARCH-NEXT:    adc x10, x10, x12
+; AARCH-NEXT:    cmp x9, x11
+; AARCH-NEXT:    ccmp x10, x11, #0, eq
+; AARCH-NEXT:    b .LBB1_7
+; AARCH-NEXT:  .LBB1_3: // %overflow.no.lhs
+; AARCH-NEXT:    cmp x3, x8
+; AARCH-NEXT:    b.eq .LBB1_8
+; AARCH-NEXT:  // %bb.4: // %overflow.no.lhs.only
+; AARCH-NEXT:    asr x8, x1, #63
+; AARCH-NEXT:    asr x10, x3, #63
+; AARCH-NEXT:    eor x9, x0, x8
+; AARCH-NEXT:    eor x11, x1, x8
+; AARCH-NEXT:    eor x12, x2, x10
+; AARCH-NEXT:    subs x9, x9, x8
+; AARCH-NEXT:    sbc x8, x11, x8
+; AARCH-NEXT:    cmp x1, #0
+; AARCH-NEXT:    eor x11, x3, x10
+; AARCH-NEXT:    cset w13, lt
+; AARCH-NEXT:    csel x8, x8, x1, lt
+; AARCH-NEXT:    csel x9, x9, x0, lt
+; AARCH-NEXT:    subs x12, x12, x10
+; AARCH-NEXT:    sbc x10, x11, x10
+; AARCH-NEXT:    cmp x3, #0
+; AARCH-NEXT:    csel x11, x12, x2, lt
+; AARCH-NEXT:    csel x10, x10, x3, lt
+; AARCH-NEXT:    umulh x12, x9, x11
+; AARCH-NEXT:    mul x15, x8, x10
+; AARCH-NEXT:    madd x8, x8, x11, x12
+; AARCH-NEXT:    cset w12, lt
+; AARCH-NEXT:    mul x14, x9, x11
+; AARCH-NEXT:    mul x11, x9, x10
+; AARCH-NEXT:    umulh x9, x9, x10
+; AARCH-NEXT:    eor w10, w12, w13
+; AARCH-NEXT:    b .LBB1_6
+; AARCH-NEXT:  .LBB1_5: // %overflow.no.rhs.only
+; AARCH-NEXT:    asr x8, x3, #63
+; AARCH-NEXT:    asr x10, x1, #63
+; AARCH-NEXT:    eor x9, x2, x8
+; AARCH-NEXT:    eor x11, x3, x8
+; AARCH-NEXT:    eor x12, x0, x10
+; AARCH-NEXT:    subs x9, x9, x8
+; AARCH-NEXT:    sbc x8, x11, x8
+; AARCH-NEXT:    cmp x3, #0
+; AARCH-NEXT:    eor x11, x1, x10
+; AARCH-NEXT:    cset w13, lt
+; AARCH-NEXT:    csel x8, x8, x3, lt
+; AARCH-NEXT:    csel x9, x9, x2, lt
+; AARCH-NEXT:    subs x12, x12, x10
+; AARCH-NEXT:    sbc x10, x11, x10
+; AARCH-NEXT:    cmp x1, #0
+; AARCH-NEXT:    csel x11, x12, x0, lt
+; AARCH-NEXT:    csel x10, x10, x1, lt
+; AARCH-NEXT:    umulh x12, x9, x11
+; AARCH-NEXT:    mul x14, x9, x11
+; AARCH-NEXT:    mul x15, x8, x10
+; AARCH-NEXT:    madd x8, x8, x11, x12
+; AARCH-NEXT:    cset w12, lt
+; AARCH-NEXT:    mul x11, x9, x10
+; AARCH-NEXT:    umulh x9, x9, x10
+; AARCH-NEXT:    eor w10, w13, w12
+; AARCH-NEXT:  .LBB1_6: // %overflow.res
+; AARCH-NEXT:    sbfx x12, x10, #0, #1
+; AARCH-NEXT:    adds x8, x8, x11
+; AARCH-NEXT:    adc x9, x9, x15
+; AARCH-NEXT:    eor x13, x14, x12
+; AARCH-NEXT:    eor x8, x8, x12
+; AARCH-NEXT:    eor x9, x9, x12
+; AARCH-NEXT:    add x0, x13, x10
+; AARCH-NEXT:    cmp x0, x10
+; AARCH-NEXT:    cset w10, lo
+; AARCH-NEXT:    cinc x8, x8, lo
+; AARCH-NEXT:    cmp x8, x10
+; AARCH-NEXT:    cinc x9, x9, lo
+; AARCH-NEXT:    cmp x9, #0
+; AARCH-NEXT:  .LBB1_7: // %overflow.res
 ; AARCH-NEXT:    cset w9, ne
-; AARCH-NEXT:    tbz x8, #63, .LBB1_2
-; AARCH-NEXT:  // %bb.1: // %Entry
-; AARCH-NEXT:    eor x8, x3, #0x8000000000000000
-; AARCH-NEXT:    orr x8, x2, x8
-; AARCH-NEXT:    cbz x8, .LBB1_3
-; AARCH-NEXT:  .LBB1_2: // %Else2
-; AARCH-NEXT:    cbz w9, .LBB1_4
-; AARCH-NEXT:  .LBB1_3: // %Then7
-; AARCH-NEXT:    mov w8, #1 // =0x1
-; AARCH-NEXT:    str w8, [x4]
-; AARCH-NEXT:  .LBB1_4: // %Block9
+; AARCH-NEXT:    tbnz x1, #63, .LBB1_9
+; AARCH-NEXT:    b .LBB1_10
+; AARCH-NEXT:  .LBB1_8: // %overflow.no
+; AARCH-NEXT:    umulh x8, x0, x2
+; AARCH-NEXT:    mov w9, wzr
+; AARCH-NEXT:    madd x8, x0, x3, x8
+; AARCH-NEXT:    mul x0, x0, x2
+; AARCH-NEXT:    madd x8, x1, x2, x8
+; AARCH-NEXT:    tbz x1, #63, .LBB1_10
+; AARCH-NEXT:  .LBB1_9: // %overflow.res
+; AARCH-NEXT:    eor x10, x3, #0x8000000000000000
+; AARCH-NEXT:    orr x10, x2, x10
+; AARCH-NEXT:    cbz x10, .LBB1_11
+; AARCH-NEXT:  .LBB1_10: // %Else2
+; AARCH-NEXT:    tbz w9, #0, .LBB1_12
+; AARCH-NEXT:  .LBB1_11: // %Then7
+; AARCH-NEXT:    mov w9, #1 // =0x1
+; AARCH-NEXT:    str w9, [x4]
+; AARCH-NEXT:  .LBB1_12: // %Block9
+; AARCH-NEXT:    mov x1, x8
 ; AARCH-NEXT:    ret
 Entry:
   store i32 0, ptr %2, align 4
diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
index 4eb82c80e2bff..8f35b6df7a937 100644
--- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll
@@ -4,212 +4,425 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; ARMV6-LABEL: muloti_test:
-; ARMV6:       @ %bb.0: @ %start
+; ARMV6:       @ %bb.0: @ %overflow.entry
 ; ARMV6-NEXT:    push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; ARMV6-NEXT:    sub sp, sp, #28
-; ARMV6-NEXT:    ldr r4, [sp, #72]
-; ARMV6-NEXT:    mov r7, r0
-; ARMV6-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; ARMV6-NEXT:    ldr r12, [sp, #64]
-; ARMV6-NEXT:    umull r1, r0, r2, r4
+; ARMV6-NEXT:    add lr, sp, #76
 ; ARMV6-NEXT:    ldr r5, [sp, #68]
-; ARMV6-NEXT:    str r1, [r7]
-; ARMV6-NEXT:    ldr r1, [sp, #76]
-; ARMV6-NEXT:    umull r7, r6, r1, r12
-; ARMV6-NEXT:    str r6, [sp, #8] @ 4-byte Spill
-; ARMV6-NEXT:    umull r6, r9, r5, r4
-; ARMV6-NEXT:    add r7, r6, r7
-; ARMV6-NEXT:    umull r4, r6, r12, r4
-; ARMV6-NEXT:    str r4, [sp, #16] @ 4-byte Spill
-; ARMV6-NEXT:    mov r4, #0
-; ARMV6-NEXT:    adds r8, r6, r7
-; ARMV6-NEXT:    ldr r6, [sp, #80]
-; ARMV6-NEXT:    adc r7, r4, #0
-; ARMV6-NEXT:    ldr r4, [sp, #84]
-; ARMV6-NEXT:    str r7, [sp, #24] @ 4-byte Spill
-; ARMV6-NEXT:    umull r12, lr, r3, r6
-; ARMV6-NEXT:    umull r11, r7, r4, r2
-; ARMV6-NEXT:    add r12, r11, r12
-; ARMV6-NEXT:    umull r11, r10, r6, r2
-; ARMV6-NEXT:    adds r12, r10, r12
-; ARMV6-NEXT:    mov r10, #0
-; ARMV6-NEXT:    adc r6, r10, #0
-; ARMV6-NEXT:    str r6, [sp, #20] @ 4-byte Spill
-; ARMV6-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
-; ARMV6-NEXT:    adds r6, r6, r11
-; ARMV6-NEXT:    str r6, [sp, #12] @ 4-byte Spill
-; ARMV6-NEXT:    adc r6, r8, r12
-; ARMV6-NEXT:    str r6, [sp, #16] @ 4-byte Spill
-; ARMV6-NEXT:    ldr r6, [sp, #72]
-; ARMV6-NEXT:    mov r12, #0
-; ARMV6-NEXT:    umull r2, r8, r2, r1
-; ARMV6-NEXT:    umlal r0, r12, r3, r6
-; ARMV6-NEXT:    adds r0, r2, r0
-; ARMV6-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
-; ARMV6-NEXT:    adcs r8, r12, r8
-; ARMV6-NEXT:    adc r12, r10, #0
-; ARMV6-NEXT:    cmp lr, #0
-; ARMV6-NEXT:    str r0, [r2, #4]
-; ARMV6-NEXT:    movne lr, #1
-; ARMV6-NEXT:    ldr r11, [sp, #8] @ 4-byte Reload
-; ARMV6-NEXT:    cmp r7, #0
-; ARMV6-NEXT:    movne r7, #1
-; ARMV6-NEXT:    ldr r0, [sp, #64]
-; ARMV6-NEXT:    cmp r11, #0
-; ARMV6-NEXT:    umlal r8, r12, r3, r1
-; ARMV6-NEXT:    movne r11, #1
-; ARMV6-NEXT:    cmp r9, #0
-; ARMV6-NEXT:    movne r9, #1
-; ARMV6-NEXT:    orrs r10, r0, r5
-; ARMV6-NEXT:    ldr r0, [sp, #80]
+; ARMV6-NEXT:    ldr r6, [sp, #64]
+; ARMV6-NEXT:    mov r9, r0
+; ARMV6-NEXT:    ldr r11, [sp, #72]
+; ARMV6-NEXT:    orrs r10, r6, r5
+; ARMV6-NEXT:    ldm lr, {r1, r12, lr}
+; ARMV6-NEXT:    beq .LBB0_3
+; ARMV6-NEXT:  @ %bb.1: @ %overflow.lhs
+; ARMV6-NEXT:    orrs r8, r12, lr
+; ARMV6-NEXT:    beq .LBB0_5
+; ARMV6-NEXT:  @ %bb.2: @ %overflow
+; ARMV6-NEXT:    umull r4, r0, r3, r12
+; ARMV6-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; ARMV6-NEXT:    umull r7, r0, lr, r2
+; ARMV6-NEXT:    str r0, [sp, #12] @ 4-byte Spill
+; ARMV6-NEXT:    umull r0, r12, r12, r2
+; ARMV6-NEXT:    add r4, r7, r4
+; ARMV6-NEXT:    str r0, [sp, #24] @ 4-byte Spill
+; ARMV6-NEXT:    mov r0, #0
+; ARMV6-NEXT:    adds r7, r12, r4
+; ARMV6-NEXT:    str r7, [sp] @ 4-byte Spill
+; ARMV6-NEXT:    adc r0, r0, #0
+; ARMV6-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; ARMV6-NEXT:    mov r0, r11
+; ARMV6-NEXT:    umull r11, r12, r1, r6
+; ARMV6-NEXT:    umull r7, r4, r5, r0
+; ARMV6-NEXT:    add r7, r7, r11
+; ARMV6-NEXT:    umull r11, r6, r6, r0
+; ARMV6-NEXT:    adds r6, r6, r7
+; ARMV6-NEXT:    mov r7, #0
+; ARMV6-NEXT:    adc r7, r7, #0
+; ARMV6-NEXT:    str r7, [sp, #4] @ 4-byte Spill
+; ARMV6-NEXT:    ldr r7, [sp, #24] @ 4-byte Reload
+; ARMV6-NEXT:    adds r7, r11, r7
+; ARMV6-NEXT:    str r7, [sp, #8] @ 4-byte Spill
+; ARMV6-NEXT:    ldr r7, [sp] @ 4-byte Reload
+; ARMV6-NEXT:    adc r6, r6, r7
+; ARMV6-NEXT:    str r6, [sp] @ 4-byte Spill
+; ARMV6-NEXT:    umull r11, r6, r2, r0
+; ARMV6-NEXT:    mov r7, #0
+; ARMV6-NEXT:    umlal r6, r7, r3, r0
+; ARMV6-NEXT:    umull r2, r0, r2, r1
+; ARMV6-NEXT:    adds r2, r2, r6
+; ARMV6-NEXT:    str r2, [sp, #24] @ 4-byte Spill
+; ARMV6-NEXT:    adcs r0, r7, r0
+; ARMV6-NEXT:    mov r7, #0
+; ARMV6-NEXT:    adc r6, r7, #0
+; ARMV6-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; ARMV6-NEXT:    umlal r0, r6, r3, r1
+; ARMV6-NEXT:    adds r2, r0, r2
+; ARMV6-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ARMV6-NEXT:    adcs r0, r6, r0
+; ARMV6-NEXT:    adc r6, r7, #0
+; ARMV6-NEXT:    cmp r8, #0
+; ARMV6-NEXT:    movne r8, #1
+; ARMV6-NEXT:    cmp r10, #0
 ; ARMV6-NEXT:    movne r10, #1
-; ARMV6-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
-; ARMV6-NEXT:    orrs r0, r0, r4
-; ARMV6-NEXT:    movne r0, #1
 ; ARMV6-NEXT:    cmp r4, #0
 ; ARMV6-NEXT:    movne r4, #1
-; ARMV6-NEXT:    cmp r3, #0
-; ARMV6-NEXT:    movne r3, #1
-; ARMV6-NEXT:    cmp r5, #0
-; ARMV6-NEXT:    movne r5, #1
 ; ARMV6-NEXT:    cmp r1, #0
 ; ARMV6-NEXT:    movne r1, #1
-; ARMV6-NEXT:    adds r6, r8, r6
-; ARMV6-NEXT:    str r6, [r2, #8]
+; ARMV6-NEXT:    cmp r5, #0
+; ARMV6-NEXT:    movne r5, #1
 ; ARMV6-NEXT:    and r1, r5, r1
+; ARMV6-NEXT:    cmp r12, #0
+; ARMV6-NEXT:    orr r1, r1, r4
+; ARMV6-NEXT:    ldr r5, [sp, #4] @ 4-byte Reload
+; ARMV6-NEXT:    movne r12, #1
+; ARMV6-NEXT:    orr r1, r1, r12
+; ARMV6-NEXT:    str r6, [sp, #8] @ 4-byte Spill
+; ARMV6-NEXT:    and r6, r10, r8
+; ARMV6-NEXT:    orr r1, r1, r5
+; ARMV6-NEXT:    orr r1, r6, r1
+; ARMV6-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
+; ARMV6-NEXT:    ldr r7, [sp, #24] @ 4-byte Reload
+; ARMV6-NEXT:    cmp r6, #0
+; ARMV6-NEXT:    movne r6, #1
+; ARMV6-NEXT:    cmp r3, #0
+; ARMV6-NEXT:    movne r3, #1
+; ARMV6-NEXT:    cmp lr, #0
+; ARMV6-NEXT:    movne lr, #1
+; ARMV6-NEXT:    and r3, lr, r3
+; ARMV6-NEXT:    orr r3, r3, r6
+; ARMV6-NEXT:    ldr r6, [sp, #20] @ 4-byte Reload
+; ARMV6-NEXT:    cmp r6, #0
+; ARMV6-NEXT:    movne r6, #1
+; ARMV6-NEXT:    orr r3, r3, r6
 ; ARMV6-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
-; ARMV6-NEXT:    orr r1, r1, r9
-; ARMV6-NEXT:    orr r1, r1, r11
-; ARMV6-NEXT:    and r0, r10, r0
-; ARMV6-NEXT:    adcs r6, r12, r6
-; ARMV6-NEXT:    str r6, [r2, #12]
-; ARMV6-NEXT:    ldr r6, [sp, #24] @ 4-byte Reload
-; ARMV6-NEXT:    orr r1, r1, r6
-; ARMV6-NEXT:    orr r0, r0, r1
-; ARMV6-NEXT:    and r1, r4, r3
-; ARMV6-NEXT:    orr r1, r1, r7
-; ARMV6-NEXT:    ldr r3, [sp, #20] @ 4-byte Reload
-; ARMV6-NEXT:    orr r1, r1, lr
+; ARMV6-NEXT:    orr r3, r3, r6
 ; ARMV6-NEXT:    orr r1, r1, r3
-; ARMV6-NEXT:    orr r0, r0, r1
-; ARMV6-NEXT:    mov r1, #0
-; ARMV6-NEXT:    adc r1, r1, #0
-; ARMV6-NEXT:    orr r0, r0, r1
-; ARMV6-NEXT:    and r0, r0, #1
-; ARMV6-NEXT:    strb r0, [r2, #16]
+; ARMV6-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
+; ARMV6-NEXT:    orr r6, r1, r3
+; ARMV6-NEXT:    b .LBB0_8
+; ARMV6-NEXT:  .LBB0_3: @ %overflow.no.lhs
+; ARMV6-NEXT:    orrs r6, r12, lr
+; ARMV6-NEXT:    beq .LBB0_7
+; ARMV6-NEXT:  @ %bb.4: @ %overflow.no.lhs.only
+; ARMV6-NEXT:    umull r0, r4, r2, r12
+; ARMV6-NEXT:    mov r7, #0
+; ARMV6-NEXT:    mov r10, #0
+; ARMV6-NEXT:    umlal r4, r7, r3, r12
+; ARMV6-NEXT:    str r0, [sp, #24] @ 4-byte Spill
+; ARMV6-NEXT:    umull r6, r8, r2, lr
+; ARMV6-NEXT:    adds r0, r6, r4
+; ARMV6-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; ARMV6-NEXT:    adcs r6, r7, r8
+; ARMV6-NEXT:    adc r7, r10, #0
+; ARMV6-NEXT:    ldr r10, [sp, #64]
+; ARMV6-NEXT:    umlal r6, r7, r3, lr
+; ARMV6-NEXT:    umull r0, r8, r12, r10
+; ARMV6-NEXT:    mla r4, r12, r5, r8
+; ARMV6-NEXT:    mov r8, r11
+; ARMV6-NEXT:    adds r12, r6, r0
+; ARMV6-NEXT:    mov r6, #0
+; ARMV6-NEXT:    mla r4, lr, r10, r4
+; ARMV6-NEXT:    adc lr, r7, r4
+; ARMV6-NEXT:    umull r11, r4, r2, r11
+; ARMV6-NEXT:    umlal r4, r6, r3, r8
+; ARMV6-NEXT:    umull r2, r0, r2, r1
+; ARMV6-NEXT:    adds r7, r2, r4
+; ARMV6-NEXT:    adcs r2, r6, r0
+; ARMV6-NEXT:    mov r0, #0
+; ARMV6-NEXT:    adc r4, r0, #0
+; ARMV6-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; ARMV6-NEXT:    umlal r2, r4, r3, r1
+; ARMV6-NEXT:    umull r3, r6, r8, r10
+; ARMV6-NEXT:    mla r5, r8, r5, r6
+; ARMV6-NEXT:    adds r2, r2, r3
+; ARMV6-NEXT:    mla r1, r1, r10, r5
+; ARMV6-NEXT:    adc r1, r4, r1
+; ARMV6-NEXT:    adds r2, r2, r0
+; ARMV6-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; ARMV6-NEXT:    adcs r0, r1, r0
+; ARMV6-NEXT:    adcs r1, r12, #0
+; ARMV6-NEXT:    adc r3, lr, #0
+; ARMV6-NEXT:    b .LBB0_6
+; ARMV6-NEXT:  .LBB0_5: @ %overflow.no.rhs.only
+; ARMV6-NEXT:    mov r10, r6
+; ARMV6-NEXT:    umull r0, r6, r11, r6
+; ARMV6-NEXT:    mov r7, #0
+; ARMV6-NEXT:    umlal r6, r7, r1, r10
+; ARMV6-NEXT:    str r0, [sp, #24] @ 4-byte Spill
+; ARMV6-NEXT:    umull r4, r8, r11, r5
+; ARMV6-NEXT:    adds r0, r4, r6
+; ARMV6-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; ARMV6-NEXT:    adcs r6, r7, r8
+; ARMV6-NEXT:    mov r0, #0
+; ARMV6-NEXT:    adc r7, r0, #0
+; ARMV6-NEXT:    umull r0, r8, r10, r12
+; ARMV6-NEXT:    mla r4, r10, lr, r8
+; ARMV6-NEXT:    umlal r6, r7, r1, r5
+; ARMV6-NEXT:    mla r4, r5, r12, r4
+; ARMV6-NEXT:    adds r10, r6, r0
+; ARMV6-NEXT:    adc r0, r7, r4
+; ARMV6-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; ARMV6-NEXT:    mov r0, r11
+; ARMV6-NEXT:    umull r11, r6, r11, r2
+; ARMV6-NEXT:    mov r7, #0
+; ARMV6-NEXT:    umull r4, r5, r0, r3
+; ARMV6-NEXT:    mov r0, #0
+; ARMV6-NEXT:    umlal r6, r7, r1, r2
+; ARMV6-NEXT:    adds r8, r4, r6
+; ARMV6-NEXT:    adcs r4, r7, r5
+; ARMV6-NEXT:    adc r5, r0, #0
+; ARMV6-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; ARMV6-NEXT:    umlal r4, r5, r1, r3
+; ARMV6-NEXT:    mov r7, r8
+; ARMV6-NEXT:    umull r1, r6, r2, r12
+; ARMV6-NEXT:    mla r2, r2, lr, r6
+; ARMV6-NEXT:    adds r1, r4, r1
+; ARMV6-NEXT:    mla r2, r3, r12, r2
+; ARMV6-NEXT:    adc r3, r5, r2
+; ARMV6-NEXT:    adds r2, r1, r0
+; ARMV6-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; ARMV6-NEXT:    adcs r0, r3, r0
+; ARMV6-NEXT:    ldr r3, [sp, #16] @ 4-byte Reload
+; ARMV6-NEXT:    adcs r1, r10, #0
+; ARMV6-NEXT:    adc r3, r3, #0
+; ARMV6-NEXT:  .LBB0_6: @ %overflow.res
+; ARMV6-NEXT:    orrs r6, r1, r3
+; ARMV6-NEXT:    movne r6, #1
+; ARMV6-NEXT:    b .LBB0_8
+; ARMV6-NEXT:  .LBB0_7: @ %overflow.no
+; ARMV6-NEXT:    mov r0, r11
+; ARMV6-NEXT:    umull r11, r8, r2, r11
+; ARMV6-NEXT:    mov r7, #0
+; ARMV6-NEXT:    mov r6, #0
+; ARMV6-NEXT:    umlal r8, r7, r3, r0
+; ARMV6-NEXT:    umull r4, r10, r2, r1
+; ARMV6-NEXT:    adds r0, r4, r8
+; ARMV6-NEXT:    ldr r4, [sp, #64]
+; ARMV6-NEXT:    adcs r10, r7, r10
+; ARMV6-NEXT:    ldr r7, [sp, #72]
+; ARMV6-NEXT:    str r0, [sp, #24] @ 4-byte Spill
+; ARMV6-NEXT:    adc r0, r6, #0
+; ARMV6-NEXT:    umlal r10, r0, r3, r1
+; ARMV6-NEXT:    umull r8, r4, r7, r4
+; ARMV6-NEXT:    mla r4, r7, r5, r4
+; ARMV6-NEXT:    ldr r5, [sp, #64]
+; ARMV6-NEXT:    ldr r7, [sp, #24] @ 4-byte Reload
+; ARMV6-NEXT:    mla r1, r1, r5, r4
+; ARMV6-NEXT:    umull r4, r5, r12, r2
+; ARMV6-NEXT:    mla r3, r12, r3, r5
+; ARMV6-NEXT:    mla r2, lr, r2, r3
+; ARMV6-NEXT:    adds r3, r4, r8
+; ARMV6-NEXT:    adc r1, r2, r1
+; ARMV6-NEXT:    adds r2, r10, r3
+; ARMV6-NEXT:    adc r0, r0, r1
+; ARMV6-NEXT:  .LBB0_8: @ %overflow.res
+; ARMV6-NEXT:    str r11, [r9]
+; ARMV6-NEXT:    str r7, [r9, #4]
+; ARMV6-NEXT:    str r2, [r9, #8]
+; ARMV6-NEXT:    str r0, [r9, #12]
+; ARMV6-NEXT:    and r0, r6, #1
+; ARMV6-NEXT:    strb r0, [r9, #16]
 ; ARMV6-NEXT:    add sp, sp, #28
 ; ARMV6-NEXT:    pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ;
 ; ARMV7-LABEL: muloti_test:
-; ARMV7:       @ %bb.0: @ %start
+; ARMV7:       @ %bb.0: @ %overflow.entry
 ; ARMV7-NEXT:    push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; ARMV7-NEXT:    sub sp, sp, #44
-; ARMV7-NEXT:    ldr r8, [sp, #88]
-; ARMV7-NEXT:    mov r9, r0
-; ARMV7-NEXT:    ldr r7, [sp, #96]
-; ARMV7-NEXT:    ldr lr, [sp, #100]
-; ARMV7-NEXT:    umull r0, r5, r2, r8
-; ARMV7-NEXT:    ldr r4, [sp, #80]
-; ARMV7-NEXT:    str r0, [sp, #32] @ 4-byte Spill
-; ARMV7-NEXT:    umull r1, r0, r3, r7
-; ARMV7-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; ARMV7-NEXT:    umull r0, r11, lr, r2
-; ARMV7-NEXT:    str r1, [sp, #20] @ 4-byte Spill
-; ARMV7-NEXT:    ldr r1, [sp, #92]
-; ARMV7-NEXT:    str r0, [sp] @ 4-byte Spill
-; ARMV7-NEXT:    umull r0, r10, r7, r2
-; ARMV7-NEXT:    mov r7, r1
-; ARMV7-NEXT:    umull r6, r12, r1, r4
-; ARMV7-NEXT:    str r0, [sp, #40] @ 4-byte Spill
-; ARMV7-NEXT:    ldr r0, [sp, #84]
-; ARMV7-NEXT:    str r6, [sp, #24] @ 4-byte Spill
-; ARMV7-NEXT:    umull r6, r1, r0, r8
-; ARMV7-NEXT:    str r6, [sp, #16] @ 4-byte Spill
-; ARMV7-NEXT:    umull r6, r2, r2, r7
-; ARMV7-NEXT:    mov r7, r4
-; ARMV7-NEXT:    str r6, [sp, #8] @ 4-byte Spill
-; ARMV7-NEXT:    str r2, [sp, #12] @ 4-byte Spill
-; ARMV7-NEXT:    umull r2, r6, r4, r8
-; ARMV7-NEXT:    str r2, [sp, #36] @ 4-byte Spill
-; ARMV7-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
-; ARMV7-NEXT:    str r6, [sp, #28] @ 4-byte Spill
-; ARMV7-NEXT:    mov r6, #0
-; ARMV7-NEXT:    str r2, [r9]
-; ARMV7-NEXT:    umlal r5, r6, r3, r8
-; ARMV7-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
-; ARMV7-NEXT:    ldr r4, [sp] @ 4-byte Reload
-; ARMV7-NEXT:    add r4, r4, r2
-; ARMV7-NEXT:    adds r2, r10, r4
-; ARMV7-NEXT:    str r2, [sp, #20] @ 4-byte Spill
-; ARMV7-NEXT:    mov r2, #0
-; ARMV7-NEXT:    adc r2, r2, #0
-; ARMV7-NEXT:    cmp r12, #0
-; ARMV7-NEXT:    str r2, [sp, #32] @ 4-byte Spill
-; ARMV7-NEXT:    movwne r12, #1
+; ARMV7-NEXT:    sub sp, sp, #12
+; ARMV7-NEXT:    ldr r7, [sp, #52]
+; ARMV7-NEXT:    ldr r10, [sp, #48]
+; ARMV7-NEXT:    ldr r4, [sp, #68]
+; ARMV7-NEXT:    ldr r9, [sp, #64]
+; ARMV7-NEXT:    orrs r1, r10, r7
+; ARMV7-NEXT:    ldr r12, [sp, #60]
+; ARMV7-NEXT:    ldr lr, [sp, #56]
+; ARMV7-NEXT:    beq .LBB0_3
+; ARMV7-NEXT:  @ %bb.1: @ %overflow.lhs
+; ARMV7-NEXT:    orr r5, r9, r4
+; ARMV7-NEXT:    cmp r5, #0
+; ARMV7-NEXT:    beq .LBB0_5
+; ARMV7-NEXT:  @ %bb.2: @ %overflow
+; ARMV7-NEXT:    movwne r5, #1
 ; ARMV7-NEXT:    cmp r1, #0
-; ARMV7-NEXT:    ldr r2, [sp, #96]
+; ARMV7-NEXT:    mov r6, r12
 ; ARMV7-NEXT:    movwne r1, #1
-; ARMV7-NEXT:    orrs r10, r7, r0
-; ARMV7-NEXT:    movwne r10, #1
-; ARMV7-NEXT:    orrs r7, r2, lr
-; ARMV7-NEXT:    ldr r2, [sp, #92]
+; ARMV7-NEXT:    and r12, r1, r5
+; ARMV7-NEXT:    cmp r6, #0
+; ARMV7-NEXT:    mov r1, r6
+; ARMV7-NEXT:    mov r8, r6
+; ARMV7-NEXT:    umull r6, r5, r7, lr
+; ARMV7-NEXT:    movwne r1, #1
+; ARMV7-NEXT:    cmp r7, #0
 ; ARMV7-NEXT:    movwne r7, #1
-; ARMV7-NEXT:    cmp r0, #0
-; ARMV7-NEXT:    movwne r0, #1
-; ARMV7-NEXT:    cmp r2, #0
-; ARMV7-NEXT:    mov r4, r2
-; ARMV7-NEXT:    mov r8, r2
-; ARMV7-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; ARMV7-NEXT:    and r1, r7, r1
+; ARMV7-NEXT:    mov r11, #0
+; ARMV7-NEXT:    cmp r5, #0
+; ARMV7-NEXT:    movwne r5, #1
+; ARMV7-NEXT:    orr r1, r1, r5
+; ARMV7-NEXT:    umull r5, r7, r8, r10
+; ARMV7-NEXT:    cmp r7, #0
+; ARMV7-NEXT:    movwne r7, #1
+; ARMV7-NEXT:    orr r7, r1, r7
+; ARMV7-NEXT:    add r1, r6, r5
+; ARMV7-NEXT:    umull r8, r6, r10, lr
+; ARMV7-NEXT:    adds r10, r6, r1
+; ARMV7-NEXT:    umull r6, r1, r4, r2
+; ARMV7-NEXT:    adc r5, r11, #0
+; ARMV7-NEXT:    orr r5, r7, r5
+; ARMV7-NEXT:    orr r7, r12, r5
+; ARMV7-NEXT:    cmp r3, #0
+; ARMV7-NEXT:    mov r5, r3
+; ARMV7-NEXT:    movwne r5, #1
+; ARMV7-NEXT:    cmp r4, #0
 ; ARMV7-NEXT:    movwne r4, #1
-; ARMV7-NEXT:    and r0, r0, r4
-; ARMV7-NEXT:    mov r4, #0
-; ARMV7-NEXT:    adds r5, r2, r5
-; ARMV7-NEXT:    str r5, [r9, #4]
-; ARMV7-NEXT:    orr r0, r0, r1
-; ARMV7-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; ARMV7-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
-; ARMV7-NEXT:    and r5, r10, r7
-; ARMV7-NEXT:    orr r0, r0, r12
-; ARMV7-NEXT:    mov r12, #0
-; ARMV7-NEXT:    add r1, r2, r1
-; ARMV7-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
-; ARMV7-NEXT:    adcs r2, r6, r2
-; ARMV7-NEXT:    ldr r6, [sp, #28] @ 4-byte Reload
-; ARMV7-NEXT:    adc r7, r4, #0
-; ARMV7-NEXT:    adds r1, r6, r1
-; ARMV7-NEXT:    umlal r2, r7, r3, r8
-; ARMV7-NEXT:    adc r4, r4, #0
-; ARMV7-NEXT:    orr r0, r0, r4
-; ARMV7-NEXT:    orr r0, r5, r0
-; ARMV7-NEXT:    ldr r4, [sp, #40] @ 4-byte Reload
-; ARMV7-NEXT:    ldr r5, [sp, #36] @ 4-byte Reload
-; ARMV7-NEXT:    adds r5, r5, r4
-; ARMV7-NEXT:    ldr r4, [sp, #20] @ 4-byte Reload
-; ARMV7-NEXT:    adc r1, r1, r4
-; ARMV7-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
+; ARMV7-NEXT:    cmp r1, #0
+; ARMV7-NEXT:    and r5, r4, r5
+; ARMV7-NEXT:    movwne r1, #1
+; ARMV7-NEXT:    orr r1, r5, r1
+; ARMV7-NEXT:    umull r5, r4, r3, r9
 ; ARMV7-NEXT:    cmp r4, #0
+; ARMV7-NEXT:    add r6, r6, r5
 ; ARMV7-NEXT:    movwne r4, #1
-; ARMV7-NEXT:    cmp r3, #0
-; ARMV7-NEXT:    movwne r3, #1
-; ARMV7-NEXT:    cmp lr, #0
-; ARMV7-NEXT:    movwne lr, #1
-; ARMV7-NEXT:    cmp r11, #0
-; ARMV7-NEXT:    movwne r11, #1
-; ARMV7-NEXT:    adds r2, r2, r5
-; ARMV7-NEXT:    and r3, lr, r3
-; ARMV7-NEXT:    str r2, [r9, #8]
-; ARMV7-NEXT:    adcs r1, r7, r1
-; ARMV7-NEXT:    str r1, [r9, #12]
-; ARMV7-NEXT:    orr r1, r3, r11
-; ARMV7-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
 ; ARMV7-NEXT:    orr r1, r1, r4
-; ARMV7-NEXT:    orr r1, r1, r2
-; ARMV7-NEXT:    orr r0, r0, r1
-; ARMV7-NEXT:    adc r1, r12, #0
-; ARMV7-NEXT:    orr r0, r0, r1
-; ARMV7-NEXT:    and r0, r0, #1
-; ARMV7-NEXT:    strb r0, [r9, #16]
-; ARMV7-NEXT:    add sp, sp, #44
+; ARMV7-NEXT:    umull r5, r4, r9, r2
+; ARMV7-NEXT:    adds r6, r4, r6
+; ARMV7-NEXT:    adc r4, r11, #0
+; ARMV7-NEXT:    orr r1, r1, r4
+; ARMV7-NEXT:    mov r4, #0
+; ARMV7-NEXT:    orr r12, r7, r1
+; ARMV7-NEXT:    adds r7, r8, r5
+; ARMV7-NEXT:    umull r8, r5, r2, lr
+; ARMV7-NEXT:    adc r6, r10, r6
+; ARMV7-NEXT:    umlal r5, r4, r3, lr
+; ARMV7-NEXT:    ldr lr, [sp, #60]
+; ARMV7-NEXT:    umull r2, r1, r2, lr
+; ARMV7-NEXT:    adds r5, r2, r5
+; ARMV7-NEXT:    adcs r1, r4, r1
+; ARMV7-NEXT:    adc r4, r11, #0
+; ARMV7-NEXT:    umlal r1, r4, r3, lr
+; ARMV7-NEXT:    adds r2, r1, r7
+; ARMV7-NEXT:    adcs r3, r4, r6
+; ARMV7-NEXT:    adc r1, r11, #0
+; ARMV7-NEXT:    orr r1, r12, r1
+; ARMV7-NEXT:    b .LBB0_8
+; ARMV7-NEXT:  .LBB0_3: @ %overflow.no.lhs
+; ARMV7-NEXT:    orrs r1, r9, r4
+; ARMV7-NEXT:    beq .LBB0_7
+; ARMV7-NEXT:  @ %bb.4: @ %overflow.no.lhs.only
+; ARMV7-NEXT:    umull r1, r5, r2, r9
+; ARMV7-NEXT:    mov r6, #0
+; ARMV7-NEXT:    mov r11, #0
+; ARMV7-NEXT:    umlal r5, r6, r3, r9
+; ARMV7-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; ARMV7-NEXT:    umull r1, r8, r2, r4
+; ARMV7-NEXT:    adds r1, r1, r5
+; ARMV7-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; ARMV7-NEXT:    adcs r5, r6, r8
+; ARMV7-NEXT:    adc r6, r11, #0
+; ARMV7-NEXT:    umull r8, r11, r9, r10
+; ARMV7-NEXT:    mla r1, r9, r7, r11
+; ARMV7-NEXT:    umlal r5, r6, r3, r4
+; ARMV7-NEXT:    mla r1, r4, r10, r1
+; ARMV7-NEXT:    adds r4, r5, r8
+; ARMV7-NEXT:    umull r8, r5, r2, lr
+; ARMV7-NEXT:    adc r9, r6, r1
+; ARMV7-NEXT:    mov r6, #0
+; ARMV7-NEXT:    umlal r5, r6, r3, lr
+; ARMV7-NEXT:    umull r2, r1, r2, r12
+; ARMV7-NEXT:    adds r5, r2, r5
+; ARMV7-NEXT:    mov r2, #0
+; ARMV7-NEXT:    adcs r1, r6, r1
+; ARMV7-NEXT:    adc r2, r2, #0
+; ARMV7-NEXT:    umlal r1, r2, r3, r12
+; ARMV7-NEXT:    umull r3, r6, lr, r10
+; ARMV7-NEXT:    mla r7, lr, r7, r6
+; ARMV7-NEXT:    adds r1, r1, r3
+; ARMV7-NEXT:    mla r7, r12, r10, r7
+; ARMV7-NEXT:    adc r3, r2, r7
+; ARMV7-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; ARMV7-NEXT:    adds r2, r1, r2
+; ARMV7-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; ARMV7-NEXT:    adcs r3, r3, r1
+; ARMV7-NEXT:    adcs r1, r4, #0
+; ARMV7-NEXT:    adc r7, r9, #0
+; ARMV7-NEXT:    b .LBB0_6
+; ARMV7-NEXT:  .LBB0_5: @ %overflow.no.rhs.only
+; ARMV7-NEXT:    umull r1, r5, lr, r10
+; ARMV7-NEXT:    mov r11, #0
+; ARMV7-NEXT:    umull r6, r8, lr, r7
+; ARMV7-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; ARMV7-NEXT:    mov r1, #0
+; ARMV7-NEXT:    umlal r5, r1, r12, r10
+; ARMV7-NEXT:    adds r5, r6, r5
+; ARMV7-NEXT:    str r5, [sp, #4] @ 4-byte Spill
+; ARMV7-NEXT:    adcs r1, r1, r8
+; ARMV7-NEXT:    adc r5, r11, #0
+; ARMV7-NEXT:    umull r8, r11, r10, r9
+; ARMV7-NEXT:    mla r6, r10, r4, r11
+; ARMV7-NEXT:    umlal r1, r5, r12, r7
+; ARMV7-NEXT:    mla r6, r7, r9, r6
+; ARMV7-NEXT:    mov r7, #0
+; ARMV7-NEXT:    adds r10, r1, r8
+; ARMV7-NEXT:    adc r11, r5, r6
+; ARMV7-NEXT:    umull r8, r5, lr, r2
+; ARMV7-NEXT:    umlal r5, r7, r12, r2
+; ARMV7-NEXT:    umull r1, r6, lr, r3
+; ARMV7-NEXT:    adds r5, r1, r5
+; ARMV7-NEXT:    adcs r1, r7, r6
+; ARMV7-NEXT:    mov r7, #0
+; ARMV7-NEXT:    adc r7, r7, #0
+; ARMV7-NEXT:    umlal r1, r7, r12, r3
+; ARMV7-NEXT:    umull r12, r6, r2, r9
+; ARMV7-NEXT:    mla r2, r2, r4, r6
+; ARMV7-NEXT:    adds r1, r1, r12
+; ARMV7-NEXT:    mla r2, r3, r9, r2
+; ARMV7-NEXT:    adc r3, r7, r2
+; ARMV7-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; ARMV7-NEXT:    adds r2, r1, r2
+; ARMV7-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; ARMV7-NEXT:    adcs r3, r3, r1
+; ARMV7-NEXT:    adcs r1, r10, #0
+; ARMV7-NEXT:    adc r7, r11, #0
+; ARMV7-NEXT:  .LBB0_6: @ %overflow.res
+; ARMV7-NEXT:    orrs r1, r1, r7
+; ARMV7-NEXT:    movwne r1, #1
+; ARMV7-NEXT:    b .LBB0_8
+; ARMV7-NEXT:  .LBB0_7: @ %overflow.no
+; ARMV7-NEXT:    umull r1, r11, r2, lr
+; ARMV7-NEXT:    mov r6, #0
+; ARMV7-NEXT:    umlal r11, r6, r3, lr
+; ARMV7-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; ARMV7-NEXT:    mov r1, #0
+; ARMV7-NEXT:    umull r5, r8, r2, r12
+; ARMV7-NEXT:    adds r5, r5, r11
+; ARMV7-NEXT:    adcs r6, r6, r8
+; ARMV7-NEXT:    adc r11, r1, #0
+; ARMV7-NEXT:    umlal r6, r11, r3, r12
+; ARMV7-NEXT:    umull r8, r12, lr, r10
+; ARMV7-NEXT:    str r6, [sp] @ 4-byte Spill
+; ARMV7-NEXT:    ldr r6, [sp, #60]
+; ARMV7-NEXT:    mla r7, lr, r7, r12
+; ARMV7-NEXT:    str r8, [sp, #8] @ 4-byte Spill
+; ARMV7-NEXT:    ldr r8, [sp, #4] @ 4-byte Reload
+; ARMV7-NEXT:    mla r12, r6, r10, r7
+; ARMV7-NEXT:    umull lr, r7, r9, r2
+; ARMV7-NEXT:    mla r3, r9, r3, r7
+; ARMV7-NEXT:    mla r2, r4, r2, r3
+; ARMV7-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
+; ARMV7-NEXT:    adds r3, lr, r3
+; ARMV7-NEXT:    adc r7, r2, r12
+; ARMV7-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; ARMV7-NEXT:    adds r2, r2, r3
+; ARMV7-NEXT:    adc r3, r11, r7
+; ARMV7-NEXT:  .LBB0_8: @ %overflow.res
+; ARMV7-NEXT:    str r8, [r0]
+; ARMV7-NEXT:    and r1, r1, #1
+; ARMV7-NEXT:    str r5, [r0, #4]
+; ARMV7-NEXT:    str r2, [r0, #8]
+; ARMV7-NEXT:    str r3, [r0, #12]
+; ARMV7-NEXT:    strb r1, [r0, #16]
+; ARMV7-NEXT:    add sp, sp, #12
 ; ARMV7-NEXT:    pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll
index 64d9831442970..91ea1a1ad75e9 100644
--- a/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll
@@ -4,12 +4,18 @@
 
 define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; ARMV6-LABEL: mulodi_test:
-; ARMV6:       @ %bb.0: @ %start
+; ARMV6:       @ %bb.0: @ %overflow.entry
 ; ARMV6-NEXT:    push {r4, r5, r11, lr}
-; ARMV6-NEXT:    umull r12, lr, r1, r2
-; ARMV6-NEXT:    umull r4, r5, r3, r0
-; ARMV6-NEXT:    cmp lr, #0
-; ARMV6-NEXT:    movne lr, #1
+; ARMV6-NEXT:    cmp r1, #0
+; ARMV6-NEXT:    beq .LBB0_3
+; ARMV6-NEXT:  @ %bb.1: @ %overflow.lhs
+; ARMV6-NEXT:    cmp r3, #0
+; ARMV6-NEXT:    beq .LBB0_5
+; ARMV6-NEXT:  @ %bb.2: @ %overflow
+; ARMV6-NEXT:    umull r12, r4, r1, r2
+; ARMV6-NEXT:    umull lr, r5, r3, r0
+; ARMV6-NEXT:    cmp r4, #0
+; ARMV6-NEXT:    movne r4, #1
 ; ARMV6-NEXT:    cmp r3, #0
 ; ARMV6-NEXT:    movne r3, #1
 ; ARMV6-NEXT:    cmp r1, #0
@@ -17,38 +23,105 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; ARMV6-NEXT:    movne r1, #1
 ; ARMV6-NEXT:    and r1, r1, r3
 ; ARMV6-NEXT:    cmp r5, #0
-; ARMV6-NEXT:    orr r1, r1, lr
+; ARMV6-NEXT:    orr r1, r1, r4
 ; ARMV6-NEXT:    movne r5, #1
 ; ARMV6-NEXT:    orr r3, r1, r5
-; ARMV6-NEXT:    add r1, r12, r4
+; ARMV6-NEXT:    add r1, r12, lr
 ; ARMV6-NEXT:    adds r1, r2, r1
 ; ARMV6-NEXT:    mov r5, #0
 ; ARMV6-NEXT:    adc r2, r5, #0
-; ARMV6-NEXT:    orr r2, r3, r2
+; ARMV6-NEXT:    orr r12, r3, r2
+; ARMV6-NEXT:    and r2, r12, #1
+; ARMV6-NEXT:    pop {r4, r5, r11, pc}
+; ARMV6-NEXT:  .LBB0_3: @ %overflow.no.lhs
+; ARMV6-NEXT:    cmp r3, #0
+; ARMV6-NEXT:    beq .LBB0_7
+; ARMV6-NEXT:  @ %bb.4: @ %overflow.no.lhs.only
+; ARMV6-NEXT:    mov lr, r0
+; ARMV6-NEXT:    umull r0, r4, r0, r2
+; ARMV6-NEXT:    mov r12, r1
+; ARMV6-NEXT:    mla r1, r1, r2, r4
+; ARMV6-NEXT:    mul r12, r12, r3
+; ARMV6-NEXT:    umlal r1, r12, lr, r3
+; ARMV6-NEXT:    b .LBB0_6
+; ARMV6-NEXT:  .LBB0_5: @ %overflow.no.rhs.only
+; ARMV6-NEXT:    mov r12, r0
+; ARMV6-NEXT:    umull r0, lr, r2, r0
+; ARMV6-NEXT:    mov r4, r1
+; ARMV6-NEXT:    mla r1, r3, r12, lr
+; ARMV6-NEXT:    mul r12, r3, r4
+; ARMV6-NEXT:    umlal r1, r12, r2, r4
+; ARMV6-NEXT:  .LBB0_6: @ %overflow.res
+; ARMV6-NEXT:    cmp r12, #0
+; ARMV6-NEXT:    movne r12, #1
+; ARMV6-NEXT:    and r2, r12, #1
+; ARMV6-NEXT:    pop {r4, r5, r11, pc}
+; ARMV6-NEXT:  .LBB0_7: @ %overflow.no
+; ARMV6-NEXT:    mov r12, r0
+; ARMV6-NEXT:    umull r0, r4, r0, r2
+; ARMV6-NEXT:    mla r3, r12, r3, r4
+; ARMV6-NEXT:    mov r12, #0
+; ARMV6-NEXT:    mla r1, r1, r2, r3
+; ARMV6-NEXT:    and r2, r12, #1
 ; ARMV6-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; ARMV7-LABEL: mulodi_test:
-; ARMV7:       @ %bb.0: @ %start
+; ARMV7:       @ %bb.0: @ %overflow.entry
 ; ARMV7-NEXT:    push {r4, r5, r11, lr}
-; ARMV7-NEXT:    umull r12, lr, r3, r0
+; ARMV7-NEXT:    cmp r1, #0
+; ARMV7-NEXT:    beq .LBB0_3
+; ARMV7-NEXT:  @ %bb.1: @ %overflow.lhs
+; ARMV7-NEXT:    cmp r3, #0
+; ARMV7-NEXT:    beq .LBB0_5
+; ARMV7-NEXT:  @ %bb.2: @ %overflow
+; ARMV7-NEXT:    umull lr, r4, r3, r0
 ; ARMV7-NEXT:    cmp r3, #0
 ; ARMV7-NEXT:    movwne r3, #1
 ; ARMV7-NEXT:    cmp r1, #0
-; ARMV7-NEXT:    umull r0, r4, r0, r2
+; ARMV7-NEXT:    umull r0, r12, r0, r2
 ; ARMV7-NEXT:    umull r2, r5, r1, r2
 ; ARMV7-NEXT:    movwne r1, #1
 ; ARMV7-NEXT:    and r1, r1, r3
 ; ARMV7-NEXT:    cmp r5, #0
 ; ARMV7-NEXT:    movwne r5, #1
-; ARMV7-NEXT:    cmp lr, #0
+; ARMV7-NEXT:    cmp r4, #0
 ; ARMV7-NEXT:    orr r1, r1, r5
-; ARMV7-NEXT:    movwne lr, #1
-; ARMV7-NEXT:    orr r3, r1, lr
-; ARMV7-NEXT:    add r1, r2, r12
+; ARMV7-NEXT:    movwne r4, #1
+; ARMV7-NEXT:    orr r3, r1, r4
+; ARMV7-NEXT:    add r1, r2, lr
 ; ARMV7-NEXT:    mov r2, #0
-; ARMV7-NEXT:    adds r1, r4, r1
+; ARMV7-NEXT:    adds r1, r12, r1
 ; ARMV7-NEXT:    adc r2, r2, #0
-; ARMV7-NEXT:    orr r2, r3, r2
+; ARMV7-NEXT:    orr r12, r3, r2
+; ARMV7-NEXT:    and r2, r12, #1
+; ARMV7-NEXT:    pop {r4, r5, r11, pc}
+; ARMV7-NEXT:  .LBB0_3: @ %overflow.no.lhs
+; ARMV7-NEXT:    mov r5, r0
+; ARMV7-NEXT:    umull r0, r4, r0, r2
+; ARMV7-NEXT:    cmp r3, #0
+; ARMV7-NEXT:    beq .LBB0_7
+; ARMV7-NEXT:  @ %bb.4: @ %overflow.no.lhs.only
+; ARMV7-NEXT:    mul r12, r1, r3
+; ARMV7-NEXT:    mla r1, r1, r2, r4
+; ARMV7-NEXT:    umlal r1, r12, r5, r3
+; ARMV7-NEXT:    b .LBB0_6
+; ARMV7-NEXT:  .LBB0_5: @ %overflow.no.rhs.only
+; ARMV7-NEXT:    mov lr, r0
+; ARMV7-NEXT:    umull r0, r4, r2, r0
+; ARMV7-NEXT:    mov r5, r1
+; ARMV7-NEXT:    mul r12, r3, r1
+; ARMV7-NEXT:    mla r1, r3, lr, r4
+; ARMV7-NEXT:    umlal r1, r12, r2, r5
+; ARMV7-NEXT:  .LBB0_6: @ %overflow.res
+; ARMV7-NEXT:    cmp r12, #0
+; ARMV7-NEXT:    movwne r12, #1
+; ARMV7-NEXT:    and r2, r12, #1
+; ARMV7-NEXT:    pop {r4, r5, r11, pc}
+; ARMV7-NEXT:  .LBB0_7: @ %overflow.no
+; ARMV7-NEXT:    mla r3, r5, r3, r4
+; ARMV7-NEXT:    mov r12, #0
+; ARMV7-NEXT:    mla r1, r1, r2, r3
+; ARMV7-NEXT:    and r2, r12, #1
 ; ARMV7-NEXT:    pop {r4, r5, r11, pc}
 start:
   %0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %l, i64 %r) #2
diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
index 968c06136225d..5498a0741bc23 100644
--- a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
@@ -4,7 +4,13 @@
 
 define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; LA32-LABEL: smuloi64:
-; LA32:       # %bb.0:
+; LA32:       # %bb.0: # %overflow.entry
+; LA32-NEXT:    srai.w $a6, $a0, 31
+; LA32-NEXT:    srai.w $a5, $a2, 31
+; LA32-NEXT:    beq $a1, $a6, .LBB0_3
+; LA32-NEXT:  # %bb.1: # %overflow.lhs
+; LA32-NEXT:    beq $a3, $a5, .LBB0_6
+; LA32-NEXT:  # %bb.2: # %overflow
 ; LA32-NEXT:    mulh.wu $a5, $a0, $a2
 ; LA32-NEXT:    mul.w $a6, $a1, $a2
 ; LA32-NEXT:    add.w $a5, $a6, $a5
@@ -38,11 +44,138 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; LA32-NEXT:    xor $a1, $a1, $a6
 ; LA32-NEXT:    xor $a3, $a3, $a6
 ; LA32-NEXT:    or $a1, $a3, $a1
-; LA32-NEXT:    sltu $a1, $zero, $a1
+; LA32-NEXT:    sltu $a6, $zero, $a1
+; LA32-NEXT:    b .LBB0_9
+; LA32-NEXT:  .LBB0_3: # %overflow.no.lhs
+; LA32-NEXT:    beq $a3, $a5, .LBB0_8
+; LA32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a1, .LBB0_10
+; LA32-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; LA32-NEXT:    move $a5, $a0
+; LA32-NEXT:    move $a6, $a1
+; LA32-NEXT:    bgez $a1, .LBB0_11
+; LA32-NEXT:    b .LBB0_12
+; LA32-NEXT:  .LBB0_6: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a3, .LBB0_14
+; LA32-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; LA32-NEXT:    move $a5, $a2
+; LA32-NEXT:    move $a6, $a3
+; LA32-NEXT:    bgez $a3, .LBB0_15
+; LA32-NEXT:    b .LBB0_16
+; LA32-NEXT:  .LBB0_8: # %overflow.no
+; LA32-NEXT:    move $a6, $zero
+; LA32-NEXT:    mulh.wu $a5, $a0, $a2
+; LA32-NEXT:    mul.w $a3, $a0, $a3
+; LA32-NEXT:    add.w $a3, $a5, $a3
+; LA32-NEXT:    mul.w $a1, $a1, $a2
+; LA32-NEXT:    add.w $a5, $a3, $a1
+; LA32-NEXT:  .LBB0_9: # %overflow.res
 ; LA32-NEXT:    mul.w $a0, $a0, $a2
+; LA32-NEXT:    b .LBB0_27
+; LA32-NEXT:  .LBB0_10:
+; LA32-NEXT:    sub.w $a5, $zero, $a0
+; LA32-NEXT:    sltu $a6, $zero, $a0
+; LA32-NEXT:    add.w $a6, $a1, $a6
+; LA32-NEXT:    sub.w $a6, $zero, $a6
+; LA32-NEXT:    bltz $a1, .LBB0_12
+; LA32-NEXT:  .LBB0_11: # %overflow.no.lhs.only
+; LA32-NEXT:    move $a6, $a1
+; LA32-NEXT:    move $a5, $a0
+; LA32-NEXT:  .LBB0_12: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a3, .LBB0_18
+; LA32-NEXT:  # %bb.13: # %overflow.no.lhs.only
+; LA32-NEXT:    move $a7, $a2
+; LA32-NEXT:    move $a0, $a3
+; LA32-NEXT:    b .LBB0_19
+; LA32-NEXT:  .LBB0_14:
+; LA32-NEXT:    sub.w $a5, $zero, $a2
+; LA32-NEXT:    sltu $a6, $zero, $a2
+; LA32-NEXT:    add.w $a6, $a3, $a6
+; LA32-NEXT:    sub.w $a6, $zero, $a6
+; LA32-NEXT:    bltz $a3, .LBB0_16
+; LA32-NEXT:  .LBB0_15: # %overflow.no.rhs.only
+; LA32-NEXT:    move $a6, $a3
+; LA32-NEXT:    move $a5, $a2
+; LA32-NEXT:  .LBB0_16: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a1, .LBB0_22
+; LA32-NEXT:  # %bb.17: # %overflow.no.rhs.only
+; LA32-NEXT:    move $a7, $a0
+; LA32-NEXT:    move $a2, $a1
+; LA32-NEXT:    b .LBB0_23
+; LA32-NEXT:  .LBB0_18:
+; LA32-NEXT:    sub.w $a7, $zero, $a2
+; LA32-NEXT:    sltu $a0, $zero, $a2
+; LA32-NEXT:    add.w $a0, $a3, $a0
+; LA32-NEXT:    sub.w $a0, $zero, $a0
+; LA32-NEXT:  .LBB0_19: # %overflow.no.lhs.only
+; LA32-NEXT:    slti $a1, $a1, 0
+; LA32-NEXT:    slti $t0, $a3, 0
+; LA32-NEXT:    bltz $a3, .LBB0_21
+; LA32-NEXT:  # %bb.20: # %overflow.no.lhs.only
+; LA32-NEXT:    move $a0, $a3
+; LA32-NEXT:    move $a7, $a2
+; LA32-NEXT:  .LBB0_21: # %overflow.no.lhs.only
+; LA32-NEXT:    mulh.wu $a2, $a5, $a7
+; LA32-NEXT:    mul.w $a3, $a6, $a7
+; LA32-NEXT:    add.w $a2, $a2, $a3
+; LA32-NEXT:    mul.w $a3, $a5, $a7
+; LA32-NEXT:    mul.w $a6, $a6, $a0
+; LA32-NEXT:    mulh.wu $a7, $a5, $a0
+; LA32-NEXT:    add.w $a6, $a7, $a6
+; LA32-NEXT:    mul.w $a0, $a5, $a0
+; LA32-NEXT:    add.w $a5, $a2, $a0
+; LA32-NEXT:    sltu $a0, $a5, $a2
+; LA32-NEXT:    add.w $a2, $a6, $a0
+; LA32-NEXT:    xor $a1, $t0, $a1
+; LA32-NEXT:    sub.w $a6, $zero, $a1
+; LA32-NEXT:    xor $a0, $a3, $a6
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    sltu $a1, $a0, $a1
+; LA32-NEXT:    xor $a3, $a5, $a6
+; LA32-NEXT:    add.w $a5, $a3, $a1
+; LA32-NEXT:    sltu $a1, $a5, $a1
+; LA32-NEXT:    xor $a2, $a2, $a6
+; LA32-NEXT:    b .LBB0_26
+; LA32-NEXT:  .LBB0_22:
+; LA32-NEXT:    sub.w $a7, $zero, $a0
+; LA32-NEXT:    sltu $a2, $zero, $a0
+; LA32-NEXT:    add.w $a2, $a1, $a2
+; LA32-NEXT:    sub.w $a2, $zero, $a2
+; LA32-NEXT:  .LBB0_23: # %overflow.no.rhs.only
+; LA32-NEXT:    slti $a3, $a3, 0
+; LA32-NEXT:    slti $t0, $a1, 0
+; LA32-NEXT:    bltz $a1, .LBB0_25
+; LA32-NEXT:  # %bb.24: # %overflow.no.rhs.only
+; LA32-NEXT:    move $a2, $a1
+; LA32-NEXT:    move $a7, $a0
+; LA32-NEXT:  .LBB0_25: # %overflow.no.rhs.only
+; LA32-NEXT:    mulh.wu $a0, $a5, $a7
+; LA32-NEXT:    mul.w $a1, $a6, $a7
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    mul.w $a1, $a5, $a7
+; LA32-NEXT:    mul.w $a6, $a6, $a2
+; LA32-NEXT:    mulh.wu $a7, $a5, $a2
+; LA32-NEXT:    add.w $a6, $a7, $a6
+; LA32-NEXT:    mul.w $a2, $a5, $a2
+; LA32-NEXT:    add.w $a2, $a0, $a2
+; LA32-NEXT:    sltu $a0, $a2, $a0
+; LA32-NEXT:    add.w $a6, $a6, $a0
+; LA32-NEXT:    xor $a3, $a3, $t0
+; LA32-NEXT:    sub.w $a7, $zero, $a3
+; LA32-NEXT:    xor $a0, $a1, $a7
+; LA32-NEXT:    add.w $a0, $a0, $a3
+; LA32-NEXT:    sltu $a1, $a0, $a3
+; LA32-NEXT:    xor $a2, $a2, $a7
+; LA32-NEXT:    add.w $a5, $a2, $a1
+; LA32-NEXT:    sltu $a1, $a5, $a1
+; LA32-NEXT:    xor $a2, $a6, $a7
+; LA32-NEXT:  .LBB0_26: # %overflow.res
+; LA32-NEXT:    add.w $a1, $a2, $a1
+; LA32-NEXT:    sltu $a6, $zero, $a1
+; LA32-NEXT:  .LBB0_27: # %overflow.res
 ; LA32-NEXT:    st.w $a0, $a4, 0
+; LA32-NEXT:    andi $a0, $a6, 1
 ; LA32-NEXT:    st.w $a5, $a4, 4
-; LA32-NEXT:    move $a0, $a1
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: smuloi64:
@@ -63,7 +196,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 
 define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; LA32-LABEL: smuloi128:
-; LA32:       # %bb.0:
+; LA32:       # %bb.0: # %overflow.entry
 ; LA32-NEXT:    addi.w $sp, $sp, -48
 ; LA32-NEXT:    .cfi_def_cfa_offset 48
 ; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
@@ -88,198 +221,608 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; LA32-NEXT:    .cfi_offset 29, -36
 ; LA32-NEXT:    .cfi_offset 30, -40
 ; LA32-NEXT:    .cfi_offset 31, -44
-; LA32-NEXT:    ld.w $a5, $a1, 12
-; LA32-NEXT:    ld.w $a6, $a1, 8
-; LA32-NEXT:    ld.w $t1, $a0, 4
-; LA32-NEXT:    ld.w $a3, $a1, 0
-; LA32-NEXT:    ld.w $a7, $a0, 8
-; LA32-NEXT:    ld.w $t0, $a0, 12
-; LA32-NEXT:    ld.w $a4, $a0, 0
-; LA32-NEXT:    ld.w $t4, $a1, 4
-; LA32-NEXT:    mulh.wu $a0, $a7, $a3
-; LA32-NEXT:    mul.w $a1, $t0, $a3
-; LA32-NEXT:    add.w $a0, $a1, $a0
-; LA32-NEXT:    sltu $a1, $a0, $a1
-; LA32-NEXT:    mulh.wu $t2, $t0, $a3
-; LA32-NEXT:    add.w $a1, $t2, $a1
-; LA32-NEXT:    mul.w $t3, $a7, $t4
-; LA32-NEXT:    add.w $t2, $t3, $a0
-; LA32-NEXT:    sltu $a0, $t2, $t3
-; LA32-NEXT:    mulh.wu $t3, $a7, $t4
-; LA32-NEXT:    add.w $a0, $t3, $a0
-; LA32-NEXT:    add.w $t5, $a1, $a0
-; LA32-NEXT:    mul.w $t6, $t0, $t4
-; LA32-NEXT:    add.w $t7, $t6, $t5
-; LA32-NEXT:    srai.w $a0, $t0, 31
-; LA32-NEXT:    mul.w $t8, $a3, $a0
-; LA32-NEXT:    add.w $t3, $t7, $t8
-; LA32-NEXT:    sltu $fp, $t3, $t7
+; LA32-NEXT:    ld.w $a3, $a1, 12
+; LA32-NEXT:    ld.w $a7, $a1, 8
+; LA32-NEXT:    ld.w $a5, $a1, 0
+; LA32-NEXT:    ld.w $a6, $a0, 0
+; LA32-NEXT:    ld.w $t0, $a0, 4
+; LA32-NEXT:    ld.w $a4, $a0, 12
+; LA32-NEXT:    ld.w $a0, $a0, 8
+; LA32-NEXT:    ld.w $a1, $a1, 4
+; LA32-NEXT:    srai.w $t1, $t0, 31
+; LA32-NEXT:    xor $t2, $a4, $t1
+; LA32-NEXT:    xor $t1, $a0, $t1
+; LA32-NEXT:    or $t2, $t1, $t2
+; LA32-NEXT:    srai.w $t1, $a1, 31
+; LA32-NEXT:    beq $t2, $zero, .LBB1_11
+; LA32-NEXT:  # %bb.1: # %overflow.lhs
+; LA32-NEXT:    xor $t2, $a7, $t1
+; LA32-NEXT:    xor $t1, $a3, $t1
+; LA32-NEXT:    or $t1, $t2, $t1
+; LA32-NEXT:    beq $t1, $zero, .LBB1_14
+; LA32-NEXT:  # %bb.2: # %overflow
+; LA32-NEXT:    mulh.wu $t1, $a0, $a5
+; LA32-NEXT:    mul.w $t2, $a4, $a5
+; LA32-NEXT:    add.w $t1, $t2, $t1
+; LA32-NEXT:    sltu $t2, $t1, $t2
+; LA32-NEXT:    mulh.wu $t3, $a4, $a5
+; LA32-NEXT:    add.w $t5, $t3, $t2
+; LA32-NEXT:    mul.w $t3, $a0, $a1
+; LA32-NEXT:    add.w $t2, $t3, $t1
+; LA32-NEXT:    sltu $t1, $t2, $t3
+; LA32-NEXT:    mulh.wu $t3, $a0, $a1
+; LA32-NEXT:    add.w $t1, $t3, $t1
+; LA32-NEXT:    add.w $t1, $t5, $t1
+; LA32-NEXT:    mul.w $t6, $a4, $a1
+; LA32-NEXT:    add.w $t7, $t6, $t1
+; LA32-NEXT:    srai.w $t3, $a4, 31
+; LA32-NEXT:    mul.w $t8, $a5, $t3
+; LA32-NEXT:    add.w $t4, $t7, $t8
+; LA32-NEXT:    sltu $fp, $t4, $t7
 ; LA32-NEXT:    sltu $t6, $t7, $t6
-; LA32-NEXT:    sltu $a1, $t5, $a1
-; LA32-NEXT:    mulh.wu $t5, $t0, $t4
-; LA32-NEXT:    add.w $a1, $t5, $a1
-; LA32-NEXT:    add.w $a1, $a1, $t6
-; LA32-NEXT:    mulh.wu $t5, $a3, $a0
+; LA32-NEXT:    sltu $t1, $t1, $t5
+; LA32-NEXT:    mulh.wu $t5, $a4, $a1
+; LA32-NEXT:    add.w $t1, $t5, $t1
+; LA32-NEXT:    add.w $t1, $t1, $t6
+; LA32-NEXT:    mulh.wu $t5, $a5, $t3
 ; LA32-NEXT:    add.w $t5, $t5, $t8
-; LA32-NEXT:    mul.w $t6, $t4, $a0
+; LA32-NEXT:    mul.w $t6, $a1, $t3
 ; LA32-NEXT:    add.w $t5, $t5, $t6
-; LA32-NEXT:    add.w $t8, $a1, $t5
-; LA32-NEXT:    mulh.wu $a1, $a4, $a3
-; LA32-NEXT:    mul.w $t5, $t1, $a3
-; LA32-NEXT:    add.w $a1, $t5, $a1
-; LA32-NEXT:    sltu $t5, $a1, $t5
-; LA32-NEXT:    mulh.wu $t6, $t1, $a3
-; LA32-NEXT:    add.w $t5, $t6, $t5
-; LA32-NEXT:    mul.w $t6, $a4, $t4
-; LA32-NEXT:    add.w $a1, $t6, $a1
-; LA32-NEXT:    sltu $t6, $a1, $t6
-; LA32-NEXT:    mulh.wu $t7, $a4, $t4
-; LA32-NEXT:    add.w $t6, $t7, $t6
-; LA32-NEXT:    add.w $t6, $t5, $t6
-; LA32-NEXT:    mul.w $t7, $t1, $t4
-; LA32-NEXT:    sltu $t5, $t6, $t5
+; LA32-NEXT:    add.w $t5, $t1, $t5
+; LA32-NEXT:    mulh.wu $t1, $a6, $a5
+; LA32-NEXT:    mul.w $t6, $t0, $a5
+; LA32-NEXT:    add.w $t1, $t6, $t1
+; LA32-NEXT:    sltu $t6, $t1, $t6
+; LA32-NEXT:    mulh.wu $t7, $t0, $a5
 ; LA32-NEXT:    add.w $t6, $t7, $t6
-; LA32-NEXT:    sltu $t7, $t6, $t7
-; LA32-NEXT:    mulh.wu $t4, $t1, $t4
-; LA32-NEXT:    add.w $t4, $t4, $t5
-; LA32-NEXT:    add.w $t4, $t4, $t7
-; LA32-NEXT:    add.w $t4, $t2, $t4
-; LA32-NEXT:    mul.w $t5, $a7, $a3
-; LA32-NEXT:    add.w $t6, $t5, $t6
-; LA32-NEXT:    sltu $t5, $t6, $t5
-; LA32-NEXT:    add.w $t7, $t4, $t5
-; LA32-NEXT:    add.w $t4, $t8, $fp
-; LA32-NEXT:    beq $t7, $t2, .LBB1_2
-; LA32-NEXT:  # %bb.1:
-; LA32-NEXT:    sltu $t5, $t7, $t2
-; LA32-NEXT:  .LBB1_2:
-; LA32-NEXT:    add.w $t5, $t3, $t5
-; LA32-NEXT:    sltu $t2, $t5, $t3
-; LA32-NEXT:    add.w $t4, $t4, $t2
-; LA32-NEXT:    mulh.wu $t2, $a4, $a6
-; LA32-NEXT:    mul.w $t3, $t1, $a6
-; LA32-NEXT:    add.w $t2, $t3, $t2
-; LA32-NEXT:    sltu $t3, $t2, $t3
-; LA32-NEXT:    mulh.wu $t8, $t1, $a6
-; LA32-NEXT:    add.w $s0, $t8, $t3
-; LA32-NEXT:    mul.w $t3, $a4, $a5
-; LA32-NEXT:    add.w $t8, $t3, $t2
-; LA32-NEXT:    sltu $t2, $t8, $t3
-; LA32-NEXT:    mulh.wu $t3, $a4, $a5
-; LA32-NEXT:    add.w $t2, $t3, $t2
+; LA32-NEXT:    mul.w $t7, $a6, $a1
+; LA32-NEXT:    add.w $t1, $t7, $t1
+; LA32-NEXT:    sltu $t7, $t1, $t7
+; LA32-NEXT:    mulh.wu $t8, $a6, $a1
+; LA32-NEXT:    add.w $t7, $t8, $t7
+; LA32-NEXT:    add.w $t7, $t6, $t7
+; LA32-NEXT:    mul.w $t8, $t0, $a1
+; LA32-NEXT:    sltu $t6, $t7, $t6
+; LA32-NEXT:    add.w $t7, $t8, $t7
+; LA32-NEXT:    sltu $t8, $t7, $t8
+; LA32-NEXT:    mulh.wu $a1, $t0, $a1
+; LA32-NEXT:    add.w $a1, $a1, $t6
+; LA32-NEXT:    add.w $a1, $a1, $t8
+; LA32-NEXT:    add.w $t8, $t2, $a1
+; LA32-NEXT:    mul.w $t6, $a0, $a5
+; LA32-NEXT:    add.w $a1, $t6, $t7
+; LA32-NEXT:    sltu $t6, $a1, $t6
+; LA32-NEXT:    add.w $t7, $t8, $t6
+; LA32-NEXT:    add.w $t5, $t5, $fp
+; LA32-NEXT:    beq $t7, $t2, .LBB1_4
+; LA32-NEXT:  # %bb.3: # %overflow
+; LA32-NEXT:    sltu $t6, $t7, $t2
+; LA32-NEXT:  .LBB1_4: # %overflow
+; LA32-NEXT:    add.w $t6, $t4, $t6
+; LA32-NEXT:    sltu $t2, $t6, $t4
+; LA32-NEXT:    add.w $t5, $t5, $t2
+; LA32-NEXT:    mulh.wu $t2, $a6, $a7
+; LA32-NEXT:    mul.w $t4, $t0, $a7
+; LA32-NEXT:    add.w $t2, $t4, $t2
+; LA32-NEXT:    sltu $t4, $t2, $t4
+; LA32-NEXT:    mulh.wu $t8, $t0, $a7
+; LA32-NEXT:    add.w $s0, $t8, $t4
+; LA32-NEXT:    mul.w $t4, $a6, $a3
+; LA32-NEXT:    add.w $t8, $t4, $t2
+; LA32-NEXT:    sltu $t2, $t8, $t4
+; LA32-NEXT:    mulh.wu $t4, $a6, $a3
+; LA32-NEXT:    add.w $t2, $t4, $t2
 ; LA32-NEXT:    add.w $t2, $s0, $t2
-; LA32-NEXT:    mul.w $s1, $t1, $a5
+; LA32-NEXT:    mul.w $s1, $t0, $a3
 ; LA32-NEXT:    add.w $s2, $s1, $t2
-; LA32-NEXT:    srai.w $t3, $a5, 31
-; LA32-NEXT:    mul.w $s3, $t3, $a4
+; LA32-NEXT:    srai.w $t4, $a3, 31
+; LA32-NEXT:    mul.w $s3, $t4, $a6
 ; LA32-NEXT:    add.w $fp, $s2, $s3
 ; LA32-NEXT:    sltu $s4, $fp, $s2
 ; LA32-NEXT:    sltu $s1, $s2, $s1
 ; LA32-NEXT:    sltu $t2, $t2, $s0
-; LA32-NEXT:    mulh.wu $s0, $t1, $a5
+; LA32-NEXT:    mulh.wu $s0, $t0, $a3
 ; LA32-NEXT:    add.w $t2, $s0, $t2
 ; LA32-NEXT:    add.w $t2, $t2, $s1
-; LA32-NEXT:    mul.w $t1, $t3, $t1
-; LA32-NEXT:    mulh.wu $s0, $t3, $a4
-; LA32-NEXT:    add.w $t1, $s0, $t1
-; LA32-NEXT:    add.w $t1, $t1, $s3
-; LA32-NEXT:    add.w $s0, $t2, $t1
-; LA32-NEXT:    add.w $t2, $t8, $t7
-; LA32-NEXT:    mul.w $t7, $a4, $a6
-; LA32-NEXT:    add.w $t1, $t7, $t6
-; LA32-NEXT:    sltu $t7, $t1, $t7
-; LA32-NEXT:    add.w $t2, $t2, $t7
-; LA32-NEXT:    add.w $t6, $s0, $s4
-; LA32-NEXT:    beq $t2, $t8, .LBB1_4
-; LA32-NEXT:  # %bb.3:
-; LA32-NEXT:    sltu $t7, $t2, $t8
-; LA32-NEXT:  .LBB1_4:
+; LA32-NEXT:    mul.w $t0, $t4, $t0
+; LA32-NEXT:    mulh.wu $s0, $t4, $a6
+; LA32-NEXT:    add.w $t0, $s0, $t0
+; LA32-NEXT:    add.w $t0, $t0, $s3
+; LA32-NEXT:    add.w $t0, $t2, $t0
+; LA32-NEXT:    add.w $s0, $t8, $t7
+; LA32-NEXT:    mul.w $t7, $a6, $a7
+; LA32-NEXT:    add.w $t2, $t7, $a1
+; LA32-NEXT:    sltu $t7, $t2, $t7
+; LA32-NEXT:    add.w $a1, $s0, $t7
+; LA32-NEXT:    add.w $t0, $t0, $s4
+; LA32-NEXT:    beq $a1, $t8, .LBB1_6
+; LA32-NEXT:  # %bb.5: # %overflow
+; LA32-NEXT:    sltu $t7, $a1, $t8
+; LA32-NEXT:  .LBB1_6: # %overflow
 ; LA32-NEXT:    add.w $t7, $fp, $t7
 ; LA32-NEXT:    sltu $t8, $t7, $fp
-; LA32-NEXT:    add.w $t8, $t6, $t8
-; LA32-NEXT:    add.w $t6, $t4, $t8
-; LA32-NEXT:    add.w $t7, $t5, $t7
-; LA32-NEXT:    sltu $s0, $t7, $t5
-; LA32-NEXT:    add.w $s4, $t6, $s0
-; LA32-NEXT:    mulh.wu $t5, $a7, $a6
-; LA32-NEXT:    mul.w $s1, $t0, $a6
-; LA32-NEXT:    add.w $s3, $s1, $t5
-; LA32-NEXT:    mul.w $fp, $a7, $a5
+; LA32-NEXT:    add.w $t8, $t0, $t8
+; LA32-NEXT:    add.w $t0, $t5, $t8
+; LA32-NEXT:    add.w $t7, $t6, $t7
+; LA32-NEXT:    sltu $s0, $t7, $t6
+; LA32-NEXT:    add.w $s4, $t0, $s0
+; LA32-NEXT:    mulh.wu $t0, $a0, $a7
+; LA32-NEXT:    mul.w $s1, $a4, $a7
+; LA32-NEXT:    add.w $s3, $s1, $t0
+; LA32-NEXT:    mul.w $fp, $a0, $a3
 ; LA32-NEXT:    add.w $s2, $fp, $s3
 ; LA32-NEXT:    add.w $t6, $s2, $s4
-; LA32-NEXT:    mul.w $s5, $a7, $a6
-; LA32-NEXT:    add.w $t5, $s5, $t7
-; LA32-NEXT:    sltu $t7, $t5, $s5
+; LA32-NEXT:    mul.w $s5, $a0, $a7
+; LA32-NEXT:    add.w $t0, $s5, $t7
+; LA32-NEXT:    sltu $t7, $t0, $s5
 ; LA32-NEXT:    add.w $t6, $t6, $t7
-; LA32-NEXT:    beq $t6, $s2, .LBB1_6
-; LA32-NEXT:  # %bb.5:
+; LA32-NEXT:    beq $t6, $s2, .LBB1_8
+; LA32-NEXT:  # %bb.7: # %overflow
 ; LA32-NEXT:    sltu $t7, $t6, $s2
-; LA32-NEXT:  .LBB1_6:
-; LA32-NEXT:    beq $s4, $t4, .LBB1_8
-; LA32-NEXT:  # %bb.7:
-; LA32-NEXT:    sltu $s0, $s4, $t4
-; LA32-NEXT:  .LBB1_8:
-; LA32-NEXT:    srai.w $t4, $t4, 31
+; LA32-NEXT:  .LBB1_8: # %overflow
+; LA32-NEXT:    beq $s4, $t5, .LBB1_10
+; LA32-NEXT:  # %bb.9: # %overflow
+; LA32-NEXT:    sltu $s0, $s4, $t5
+; LA32-NEXT:  .LBB1_10: # %overflow
+; LA32-NEXT:    srai.w $t5, $t5, 31
 ; LA32-NEXT:    srai.w $t8, $t8, 31
-; LA32-NEXT:    add.w $t8, $t4, $t8
+; LA32-NEXT:    add.w $t8, $t5, $t8
 ; LA32-NEXT:    add.w $s0, $t8, $s0
 ; LA32-NEXT:    sltu $s1, $s3, $s1
-; LA32-NEXT:    mulh.wu $s3, $t0, $a6
+; LA32-NEXT:    mulh.wu $s3, $a4, $a7
 ; LA32-NEXT:    add.w $s1, $s3, $s1
 ; LA32-NEXT:    sltu $fp, $s2, $fp
-; LA32-NEXT:    mulh.wu $s2, $a7, $a5
+; LA32-NEXT:    mulh.wu $s2, $a0, $a3
 ; LA32-NEXT:    add.w $fp, $s2, $fp
 ; LA32-NEXT:    add.w $fp, $s1, $fp
-; LA32-NEXT:    mul.w $s2, $t0, $a5
+; LA32-NEXT:    mul.w $s2, $a4, $a3
 ; LA32-NEXT:    add.w $s3, $s2, $fp
-; LA32-NEXT:    mul.w $s4, $a6, $a0
-; LA32-NEXT:    mul.w $s5, $t3, $a7
+; LA32-NEXT:    mul.w $s4, $a7, $t3
+; LA32-NEXT:    mul.w $s5, $t4, $a0
 ; LA32-NEXT:    add.w $s6, $s5, $s4
 ; LA32-NEXT:    add.w $s7, $s3, $s6
 ; LA32-NEXT:    add.w $s8, $s7, $s0
 ; LA32-NEXT:    add.w $t7, $s8, $t7
 ; LA32-NEXT:    sltu $ra, $t7, $s8
-; LA32-NEXT:    sltu $t4, $t8, $t4
-; LA32-NEXT:    add.w $t4, $t8, $t4
+; LA32-NEXT:    sltu $t5, $t8, $t5
+; LA32-NEXT:    add.w $t5, $t8, $t5
 ; LA32-NEXT:    sltu $t8, $s0, $t8
-; LA32-NEXT:    add.w $t4, $t4, $t8
+; LA32-NEXT:    add.w $t5, $t5, $t8
 ; LA32-NEXT:    sltu $t8, $s7, $s3
 ; LA32-NEXT:    sltu $s0, $s3, $s2
 ; LA32-NEXT:    sltu $fp, $fp, $s1
-; LA32-NEXT:    mulh.wu $s1, $t0, $a5
+; LA32-NEXT:    mulh.wu $s1, $a4, $a3
 ; LA32-NEXT:    add.w $fp, $s1, $fp
 ; LA32-NEXT:    add.w $fp, $fp, $s0
-; LA32-NEXT:    mulh.wu $a6, $a6, $a0
-; LA32-NEXT:    add.w $a6, $a6, $s4
-; LA32-NEXT:    mul.w $a0, $a5, $a0
-; LA32-NEXT:    add.w $a0, $a6, $a0
-; LA32-NEXT:    mul.w $a5, $t3, $t0
-; LA32-NEXT:    mulh.wu $a6, $t3, $a7
-; LA32-NEXT:    add.w $a5, $a6, $a5
-; LA32-NEXT:    add.w $a5, $a5, $s5
-; LA32-NEXT:    add.w $a0, $a5, $a0
-; LA32-NEXT:    sltu $a5, $s6, $s5
-; LA32-NEXT:    add.w $a0, $a0, $a5
+; LA32-NEXT:    mulh.wu $a7, $a7, $t3
+; LA32-NEXT:    add.w $a7, $a7, $s4
+; LA32-NEXT:    mul.w $a3, $a3, $t3
+; LA32-NEXT:    add.w $a3, $a7, $a3
+; LA32-NEXT:    mul.w $a4, $t4, $a4
+; LA32-NEXT:    mulh.wu $a0, $t4, $a0
+; LA32-NEXT:    add.w $a0, $a0, $a4
+; LA32-NEXT:    add.w $a0, $a0, $s5
+; LA32-NEXT:    add.w $a0, $a0, $a3
+; LA32-NEXT:    sltu $a3, $s6, $s5
+; LA32-NEXT:    add.w $a0, $a0, $a3
 ; LA32-NEXT:    add.w $a0, $fp, $a0
 ; LA32-NEXT:    add.w $a0, $a0, $t8
-; LA32-NEXT:    add.w $a0, $a0, $t4
-; LA32-NEXT:    sltu $a5, $s8, $s7
-; LA32-NEXT:    add.w $a0, $a0, $a5
+; LA32-NEXT:    add.w $a0, $a0, $t5
+; LA32-NEXT:    sltu $a3, $s8, $s7
+; LA32-NEXT:    add.w $a0, $a0, $a3
 ; LA32-NEXT:    add.w $a0, $a0, $ra
-; LA32-NEXT:    srai.w $a5, $t2, 31
-; LA32-NEXT:    xor $a0, $a0, $a5
-; LA32-NEXT:    xor $a6, $t6, $a5
-; LA32-NEXT:    or $a0, $a6, $a0
-; LA32-NEXT:    xor $a6, $t7, $a5
-; LA32-NEXT:    xor $a5, $t5, $a5
-; LA32-NEXT:    or $a5, $a5, $a6
-; LA32-NEXT:    or $a0, $a5, $a0
-; LA32-NEXT:    sltu $a0, $zero, $a0
-; LA32-NEXT:    mul.w $a3, $a4, $a3
-; LA32-NEXT:    st.w $a3, $a2, 0
-; LA32-NEXT:    st.w $a1, $a2, 4
-; LA32-NEXT:    st.w $t1, $a2, 8
-; LA32-NEXT:    st.w $t2, $a2, 12
+; LA32-NEXT:    srai.w $a3, $a1, 31
+; LA32-NEXT:    xor $a0, $a0, $a3
+; LA32-NEXT:    xor $a4, $t6, $a3
+; LA32-NEXT:    or $a0, $a4, $a0
+; LA32-NEXT:    xor $a4, $t7, $a3
+; LA32-NEXT:    xor $a3, $t0, $a3
+; LA32-NEXT:    or $a3, $a3, $a4
+; LA32-NEXT:    or $a0, $a3, $a0
+; LA32-NEXT:    sltu $t3, $zero, $a0
+; LA32-NEXT:    b .LBB1_17
+; LA32-NEXT:  .LBB1_11: # %overflow.no.lhs
+; LA32-NEXT:    xor $t2, $a7, $t1
+; LA32-NEXT:    xor $t1, $a3, $t1
+; LA32-NEXT:    or $t1, $t2, $t1
+; LA32-NEXT:    beq $t1, $zero, .LBB1_16
+; LA32-NEXT:  # %bb.12: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a4, .LBB1_18
+; LA32-NEXT:  # %bb.13: # %overflow.no.lhs.only
+; LA32-NEXT:    move $t1, $a0
+; LA32-NEXT:    move $t3, $a4
+; LA32-NEXT:    move $t2, $a6
+; LA32-NEXT:    move $t4, $t0
+; LA32-NEXT:    bgez $a4, .LBB1_19
+; LA32-NEXT:    b .LBB1_20
+; LA32-NEXT:  .LBB1_14: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a3, .LBB1_35
+; LA32-NEXT:  # %bb.15: # %overflow.no.rhs.only
+; LA32-NEXT:    move $t1, $a7
+; LA32-NEXT:    move $t3, $a3
+; LA32-NEXT:    move $t2, $a5
+; LA32-NEXT:    move $t4, $a1
+; LA32-NEXT:    bgez $a3, .LBB1_36
+; LA32-NEXT:    b .LBB1_37
+; LA32-NEXT:  .LBB1_16: # %overflow.no
+; LA32-NEXT:    move $t3, $zero
+; LA32-NEXT:    mulh.wu $t1, $a6, $a5
+; LA32-NEXT:    mul.w $t2, $t0, $a5
+; LA32-NEXT:    add.w $t1, $t2, $t1
+; LA32-NEXT:    sltu $t2, $t1, $t2
+; LA32-NEXT:    mulh.wu $t4, $t0, $a5
+; LA32-NEXT:    add.w $t4, $t4, $t2
+; LA32-NEXT:    mul.w $t2, $a6, $a1
+; LA32-NEXT:    add.w $t1, $t2, $t1
+; LA32-NEXT:    sltu $t2, $t1, $t2
+; LA32-NEXT:    mulh.wu $t5, $a6, $a1
+; LA32-NEXT:    add.w $t2, $t5, $t2
+; LA32-NEXT:    add.w $t5, $t4, $t2
+; LA32-NEXT:    mul.w $t6, $t0, $a1
+; LA32-NEXT:    add.w $t7, $t6, $t5
+; LA32-NEXT:    mul.w $t2, $a5, $a0
+; LA32-NEXT:    mul.w $t8, $a7, $a6
+; LA32-NEXT:    add.w $fp, $t8, $t2
+; LA32-NEXT:    add.w $t2, $t7, $fp
+; LA32-NEXT:    sltu $t6, $t7, $t6
+; LA32-NEXT:    sltu $t7, $t2, $t7
+; LA32-NEXT:    sltu $t4, $t5, $t4
+; LA32-NEXT:    mulh.wu $t5, $t0, $a1
+; LA32-NEXT:    add.w $t4, $t5, $t4
+; LA32-NEXT:    add.w $t4, $t4, $t6
+; LA32-NEXT:    mul.w $t0, $a7, $t0
+; LA32-NEXT:    mulh.wu $a7, $a7, $a6
+; LA32-NEXT:    add.w $a7, $a7, $t0
+; LA32-NEXT:    mul.w $a3, $a3, $a6
+; LA32-NEXT:    add.w $a3, $a7, $a3
+; LA32-NEXT:    mulh.wu $a7, $a5, $a0
+; LA32-NEXT:    mul.w $a4, $a5, $a4
+; LA32-NEXT:    add.w $a4, $a7, $a4
+; LA32-NEXT:    mul.w $a0, $a1, $a0
+; LA32-NEXT:    add.w $a0, $a4, $a0
+; LA32-NEXT:    add.w $a0, $a3, $a0
+; LA32-NEXT:    sltu $a1, $fp, $t8
+; LA32-NEXT:    add.w $a0, $a0, $a1
+; LA32-NEXT:    add.w $a0, $t4, $a0
+; LA32-NEXT:    add.w $a1, $a0, $t7
+; LA32-NEXT:  .LBB1_17: # %overflow.res
+; LA32-NEXT:    mul.w $a0, $a6, $a5
+; LA32-NEXT:    b .LBB1_53
+; LA32-NEXT:  .LBB1_18:
+; LA32-NEXT:    sub.w $t2, $zero, $a0
+; LA32-NEXT:    or $t1, $a6, $t0
+; LA32-NEXT:    sltu $t3, $zero, $t1
+; LA32-NEXT:    sub.w $t1, $t2, $t3
+; LA32-NEXT:    sltu $t2, $t2, $t3
+; LA32-NEXT:    sltu $t3, $zero, $a0
+; LA32-NEXT:    add.w $t3, $a4, $t3
+; LA32-NEXT:    add.w $t2, $t3, $t2
+; LA32-NEXT:    sub.w $t3, $zero, $t2
+; LA32-NEXT:    sub.w $t2, $zero, $a6
+; LA32-NEXT:    sltu $t4, $zero, $a6
+; LA32-NEXT:    add.w $t4, $t0, $t4
+; LA32-NEXT:    sub.w $t4, $zero, $t4
+; LA32-NEXT:    bltz $a4, .LBB1_20
+; LA32-NEXT:  .LBB1_19: # %overflow.no.lhs.only
+; LA32-NEXT:    move $t3, $a4
+; LA32-NEXT:    move $t1, $a0
+; LA32-NEXT:  .LBB1_20: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a4, .LBB1_24
+; LA32-NEXT:  # %bb.21: # %overflow.no.lhs.only
+; LA32-NEXT:    move $t4, $t0
+; LA32-NEXT:    bgez $a4, .LBB1_25
+; LA32-NEXT:  .LBB1_22: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a3, .LBB1_26
+; LA32-NEXT:  .LBB1_23: # %overflow.no.lhs.only
+; LA32-NEXT:    move $a0, $a7
+; LA32-NEXT:    move $a6, $a3
+; LA32-NEXT:    move $t0, $a5
+; LA32-NEXT:    move $t5, $a1
+; LA32-NEXT:    bgez $a3, .LBB1_27
+; LA32-NEXT:    b .LBB1_28
+; LA32-NEXT:  .LBB1_24: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a4, .LBB1_22
+; LA32-NEXT:  .LBB1_25: # %overflow.no.lhs.only
+; LA32-NEXT:    move $t2, $a6
+; LA32-NEXT:    bgez $a3, .LBB1_23
+; LA32-NEXT:  .LBB1_26:
+; LA32-NEXT:    sub.w $a6, $zero, $a7
+; LA32-NEXT:    or $a0, $a5, $a1
+; LA32-NEXT:    sltu $t0, $zero, $a0
+; LA32-NEXT:    sub.w $a0, $a6, $t0
+; LA32-NEXT:    sltu $a6, $a6, $t0
+; LA32-NEXT:    sltu $t0, $zero, $a7
+; LA32-NEXT:    add.w $t0, $a3, $t0
+; LA32-NEXT:    add.w $a6, $t0, $a6
+; LA32-NEXT:    sub.w $a6, $zero, $a6
+; LA32-NEXT:    sub.w $t0, $zero, $a5
+; LA32-NEXT:    sltu $t5, $zero, $a5
+; LA32-NEXT:    add.w $t5, $a1, $t5
+; LA32-NEXT:    sub.w $t5, $zero, $t5
+; LA32-NEXT:    bltz $a3, .LBB1_28
+; LA32-NEXT:  .LBB1_27: # %overflow.no.lhs.only
+; LA32-NEXT:    move $a6, $a3
+; LA32-NEXT:    move $a0, $a7
+; LA32-NEXT:  .LBB1_28: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a3, .LBB1_30
+; LA32-NEXT:  # %bb.29: # %overflow.no.lhs.only
+; LA32-NEXT:    move $t5, $a1
+; LA32-NEXT:    bgez $a3, .LBB1_31
+; LA32-NEXT:    b .LBB1_32
+; LA32-NEXT:  .LBB1_30: # %overflow.no.lhs.only
+; LA32-NEXT:    bltz $a3, .LBB1_32
+; LA32-NEXT:  .LBB1_31: # %overflow.no.lhs.only
+; LA32-NEXT:    move $t0, $a5
+; LA32-NEXT:  .LBB1_32: # %overflow.no.lhs.only
+; LA32-NEXT:    slti $a1, $a4, 0
+; LA32-NEXT:    slti $a3, $a3, 0
+; LA32-NEXT:    mulh.wu $a4, $t2, $t0
+; LA32-NEXT:    mul.w $a5, $t4, $t0
+; LA32-NEXT:    add.w $a4, $a5, $a4
+; LA32-NEXT:    sltu $a5, $a4, $a5
+; LA32-NEXT:    mulh.wu $a7, $t4, $t0
+; LA32-NEXT:    add.w $a5, $a7, $a5
+; LA32-NEXT:    mul.w $a7, $t2, $t5
+; LA32-NEXT:    add.w $a4, $a7, $a4
+; LA32-NEXT:    sltu $a7, $a4, $a7
+; LA32-NEXT:    mulh.wu $t6, $t2, $t5
+; LA32-NEXT:    add.w $a7, $t6, $a7
+; LA32-NEXT:    add.w $a7, $a5, $a7
+; LA32-NEXT:    mul.w $t6, $t4, $t5
+; LA32-NEXT:    add.w $t7, $t6, $a7
+; LA32-NEXT:    mul.w $t8, $t0, $t1
+; LA32-NEXT:    add.w $t8, $t7, $t8
+; LA32-NEXT:    sltu $fp, $t8, $t7
+; LA32-NEXT:    sltu $t6, $t7, $t6
+; LA32-NEXT:    sltu $a5, $a7, $a5
+; LA32-NEXT:    mulh.wu $a7, $t4, $t5
+; LA32-NEXT:    add.w $a5, $a7, $a5
+; LA32-NEXT:    add.w $a5, $a5, $t6
+; LA32-NEXT:    mulh.wu $a7, $t0, $t1
+; LA32-NEXT:    mul.w $t6, $t0, $t3
+; LA32-NEXT:    add.w $a7, $a7, $t6
+; LA32-NEXT:    mul.w $t5, $t5, $t1
+; LA32-NEXT:    add.w $a7, $a7, $t5
+; LA32-NEXT:    add.w $a5, $a5, $a7
+; LA32-NEXT:    add.w $a7, $a5, $fp
+; LA32-NEXT:    mul.w $a5, $t2, $t0
+; LA32-NEXT:    mulh.wu $t0, $t2, $a0
+; LA32-NEXT:    mul.w $t5, $t4, $a0
+; LA32-NEXT:    add.w $t0, $t5, $t0
+; LA32-NEXT:    sltu $t5, $t0, $t5
+; LA32-NEXT:    mulh.wu $t6, $t4, $a0
+; LA32-NEXT:    add.w $t5, $t6, $t5
+; LA32-NEXT:    mul.w $t6, $t2, $a6
+; LA32-NEXT:    add.w $t7, $t6, $t0
+; LA32-NEXT:    sltu $t0, $t7, $t6
+; LA32-NEXT:    mulh.wu $t6, $t2, $a6
+; LA32-NEXT:    add.w $t0, $t6, $t0
+; LA32-NEXT:    add.w $t6, $t5, $t0
+; LA32-NEXT:    mul.w $fp, $t4, $a6
+; LA32-NEXT:    add.w $s0, $fp, $t6
+; LA32-NEXT:    mul.w $t0, $a0, $t1
+; LA32-NEXT:    add.w $t0, $s0, $t0
+; LA32-NEXT:    sltu $s1, $t0, $s0
+; LA32-NEXT:    sltu $fp, $s0, $fp
+; LA32-NEXT:    sltu $t5, $t6, $t5
+; LA32-NEXT:    mulh.wu $t4, $t4, $a6
+; LA32-NEXT:    add.w $t4, $t4, $t5
+; LA32-NEXT:    add.w $t4, $t4, $fp
+; LA32-NEXT:    mulh.wu $t5, $a0, $t1
+; LA32-NEXT:    mul.w $t3, $a0, $t3
+; LA32-NEXT:    add.w $t3, $t5, $t3
+; LA32-NEXT:    mul.w $a6, $a6, $t1
+; LA32-NEXT:    add.w $a6, $t3, $a6
+; LA32-NEXT:    add.w $t3, $t4, $a6
+; LA32-NEXT:    mul.w $a0, $t2, $a0
+; LA32-NEXT:    add.w $t2, $a7, $t7
+; LA32-NEXT:    add.w $a6, $t8, $a0
+; LA32-NEXT:    sltu $t1, $a6, $t8
+; LA32-NEXT:    add.w $t2, $t2, $t1
+; LA32-NEXT:    add.w $a0, $t3, $s1
+; LA32-NEXT:    beq $t2, $a7, .LBB1_34
+; LA32-NEXT:  # %bb.33: # %overflow.no.lhs.only
+; LA32-NEXT:    sltu $t1, $t2, $a7
+; LA32-NEXT:  .LBB1_34: # %overflow.no.lhs.only
+; LA32-NEXT:    add.w $a7, $t0, $t1
+; LA32-NEXT:    sltu $t0, $a7, $t0
+; LA32-NEXT:    add.w $t0, $a0, $t0
+; LA32-NEXT:    xor $a1, $a3, $a1
+; LA32-NEXT:    sub.w $a3, $zero, $a1
+; LA32-NEXT:    xor $a4, $a4, $a3
+; LA32-NEXT:    xor $a5, $a5, $a3
+; LA32-NEXT:    add.w $a0, $a5, $a1
+; LA32-NEXT:    sltu $a5, $a0, $a5
+; LA32-NEXT:    add.w $t1, $a4, $a5
+; LA32-NEXT:    sltui $a4, $t1, 1
+; LA32-NEXT:    sltu $a1, $a0, $a1
+; LA32-NEXT:    and $a4, $a4, $a1
+; LA32-NEXT:    xor $a1, $t2, $a3
+; LA32-NEXT:    xor $a5, $a6, $a3
+; LA32-NEXT:    add.w $t2, $a5, $a4
+; LA32-NEXT:    sltu $a5, $t2, $a5
+; LA32-NEXT:    add.w $a1, $a1, $a5
+; LA32-NEXT:    sltui $a5, $a1, 1
+; LA32-NEXT:    sltu $a4, $t2, $a4
+; LA32-NEXT:    and $a4, $a5, $a4
+; LA32-NEXT:    xor $a5, $t0, $a3
+; LA32-NEXT:    xor $a3, $a7, $a3
+; LA32-NEXT:    add.w $a4, $a3, $a4
+; LA32-NEXT:    sltu $a3, $a4, $a3
+; LA32-NEXT:    add.w $a3, $a5, $a3
+; LA32-NEXT:    or $a3, $a4, $a3
+; LA32-NEXT:    b .LBB1_52
+; LA32-NEXT:  .LBB1_35:
+; LA32-NEXT:    sub.w $t2, $zero, $a7
+; LA32-NEXT:    or $t1, $a5, $a1
+; LA32-NEXT:    sltu $t3, $zero, $t1
+; LA32-NEXT:    sub.w $t1, $t2, $t3
+; LA32-NEXT:    sltu $t2, $t2, $t3
+; LA32-NEXT:    sltu $t3, $zero, $a7
+; LA32-NEXT:    add.w $t3, $a3, $t3
+; LA32-NEXT:    add.w $t2, $t3, $t2
+; LA32-NEXT:    sub.w $t3, $zero, $t2
+; LA32-NEXT:    sub.w $t2, $zero, $a5
+; LA32-NEXT:    sltu $t4, $zero, $a5
+; LA32-NEXT:    add.w $t4, $a1, $t4
+; LA32-NEXT:    sub.w $t4, $zero, $t4
+; LA32-NEXT:    bltz $a3, .LBB1_37
+; LA32-NEXT:  .LBB1_36: # %overflow.no.rhs.only
+; LA32-NEXT:    move $t3, $a3
+; LA32-NEXT:    move $t1, $a7
+; LA32-NEXT:  .LBB1_37: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a3, .LBB1_41
+; LA32-NEXT:  # %bb.38: # %overflow.no.rhs.only
+; LA32-NEXT:    move $t4, $a1
+; LA32-NEXT:    bgez $a3, .LBB1_42
+; LA32-NEXT:  .LBB1_39: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a4, .LBB1_43
+; LA32-NEXT:  .LBB1_40: # %overflow.no.rhs.only
+; LA32-NEXT:    move $a1, $a0
+; LA32-NEXT:    move $a5, $a4
+; LA32-NEXT:    move $a7, $a6
+; LA32-NEXT:    move $t5, $t0
+; LA32-NEXT:    bgez $a4, .LBB1_44
+; LA32-NEXT:    b .LBB1_45
+; LA32-NEXT:  .LBB1_41: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a3, .LBB1_39
+; LA32-NEXT:  .LBB1_42: # %overflow.no.rhs.only
+; LA32-NEXT:    move $t2, $a5
+; LA32-NEXT:    bgez $a4, .LBB1_40
+; LA32-NEXT:  .LBB1_43:
+; LA32-NEXT:    sub.w $a5, $zero, $a0
+; LA32-NEXT:    or $a1, $a6, $t0
+; LA32-NEXT:    sltu $a7, $zero, $a1
+; LA32-NEXT:    sub.w $a1, $a5, $a7
+; LA32-NEXT:    sltu $a5, $a5, $a7
+; LA32-NEXT:    sltu $a7, $zero, $a0
+; LA32-NEXT:    add.w $a7, $a4, $a7
+; LA32-NEXT:    add.w $a5, $a7, $a5
+; LA32-NEXT:    sub.w $a5, $zero, $a5
+; LA32-NEXT:    sub.w $a7, $zero, $a6
+; LA32-NEXT:    sltu $t5, $zero, $a6
+; LA32-NEXT:    add.w $t5, $t0, $t5
+; LA32-NEXT:    sub.w $t5, $zero, $t5
+; LA32-NEXT:    bltz $a4, .LBB1_45
+; LA32-NEXT:  .LBB1_44: # %overflow.no.rhs.only
+; LA32-NEXT:    move $a5, $a4
+; LA32-NEXT:    move $a1, $a0
+; LA32-NEXT:  .LBB1_45: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a4, .LBB1_47
+; LA32-NEXT:  # %bb.46: # %overflow.no.rhs.only
+; LA32-NEXT:    move $t5, $t0
+; LA32-NEXT:    bgez $a4, .LBB1_48
+; LA32-NEXT:    b .LBB1_49
+; LA32-NEXT:  .LBB1_47: # %overflow.no.rhs.only
+; LA32-NEXT:    bltz $a4, .LBB1_49
+; LA32-NEXT:  .LBB1_48: # %overflow.no.rhs.only
+; LA32-NEXT:    move $a7, $a6
+; LA32-NEXT:  .LBB1_49: # %overflow.no.rhs.only
+; LA32-NEXT:    slti $a0, $a3, 0
+; LA32-NEXT:    slti $a3, $a4, 0
+; LA32-NEXT:    mulh.wu $a4, $t2, $a7
+; LA32-NEXT:    mul.w $a6, $t4, $a7
+; LA32-NEXT:    add.w $a4, $a6, $a4
+; LA32-NEXT:    sltu $a6, $a4, $a6
+; LA32-NEXT:    mulh.wu $t0, $t4, $a7
+; LA32-NEXT:    add.w $a6, $t0, $a6
+; LA32-NEXT:    mul.w $t0, $t2, $t5
+; LA32-NEXT:    add.w $a4, $t0, $a4
+; LA32-NEXT:    sltu $t0, $a4, $t0
+; LA32-NEXT:    mulh.wu $t6, $t2, $t5
+; LA32-NEXT:    add.w $t0, $t6, $t0
+; LA32-NEXT:    add.w $t0, $a6, $t0
+; LA32-NEXT:    mul.w $t6, $t4, $t5
+; LA32-NEXT:    add.w $t7, $t6, $t0
+; LA32-NEXT:    mul.w $t8, $a7, $t1
+; LA32-NEXT:    add.w $t8, $t7, $t8
+; LA32-NEXT:    sltu $fp, $t8, $t7
+; LA32-NEXT:    sltu $t6, $t7, $t6
+; LA32-NEXT:    sltu $a6, $t0, $a6
+; LA32-NEXT:    mulh.wu $t0, $t4, $t5
+; LA32-NEXT:    add.w $a6, $t0, $a6
+; LA32-NEXT:    add.w $a6, $a6, $t6
+; LA32-NEXT:    mulh.wu $t0, $a7, $t1
+; LA32-NEXT:    mul.w $t6, $a7, $t3
+; LA32-NEXT:    add.w $t0, $t0, $t6
+; LA32-NEXT:    mul.w $t5, $t5, $t1
+; LA32-NEXT:    add.w $t0, $t0, $t5
+; LA32-NEXT:    add.w $a6, $a6, $t0
+; LA32-NEXT:    add.w $t0, $a6, $fp
+; LA32-NEXT:    mul.w $a6, $t2, $a7
+; LA32-NEXT:    mulh.wu $a7, $t2, $a1
+; LA32-NEXT:    mul.w $t5, $t4, $a1
+; LA32-NEXT:    add.w $a7, $t5, $a7
+; LA32-NEXT:    sltu $t5, $a7, $t5
+; LA32-NEXT:    mulh.wu $t6, $t4, $a1
+; LA32-NEXT:    add.w $t5, $t6, $t5
+; LA32-NEXT:    mul.w $t6, $t2, $a5
+; LA32-NEXT:    add.w $t7, $t6, $a7
+; LA32-NEXT:    sltu $a7, $t7, $t6
+; LA32-NEXT:    mulh.wu $t6, $t2, $a5
+; LA32-NEXT:    add.w $a7, $t6, $a7
+; LA32-NEXT:    add.w $t6, $t5, $a7
+; LA32-NEXT:    mul.w $fp, $t4, $a5
+; LA32-NEXT:    add.w $s0, $fp, $t6
+; LA32-NEXT:    mul.w $a7, $a1, $t1
+; LA32-NEXT:    add.w $a7, $s0, $a7
+; LA32-NEXT:    sltu $s1, $a7, $s0
+; LA32-NEXT:    sltu $fp, $s0, $fp
+; LA32-NEXT:    sltu $t5, $t6, $t5
+; LA32-NEXT:    mulh.wu $t4, $t4, $a5
+; LA32-NEXT:    add.w $t4, $t4, $t5
+; LA32-NEXT:    add.w $t4, $t4, $fp
+; LA32-NEXT:    mulh.wu $t5, $a1, $t1
+; LA32-NEXT:    mul.w $t3, $a1, $t3
+; LA32-NEXT:    add.w $t3, $t5, $t3
+; LA32-NEXT:    mul.w $a5, $a5, $t1
+; LA32-NEXT:    add.w $a5, $t3, $a5
+; LA32-NEXT:    add.w $t1, $t4, $a5
+; LA32-NEXT:    mul.w $a1, $t2, $a1
+; LA32-NEXT:    add.w $a5, $t0, $t7
+; LA32-NEXT:    add.w $a1, $t8, $a1
+; LA32-NEXT:    sltu $t2, $a1, $t8
+; LA32-NEXT:    add.w $a5, $a5, $t2
+; LA32-NEXT:    add.w $t1, $t1, $s1
+; LA32-NEXT:    beq $a5, $t0, .LBB1_51
+; LA32-NEXT:  # %bb.50: # %overflow.no.rhs.only
+; LA32-NEXT:    sltu $t2, $a5, $t0
+; LA32-NEXT:  .LBB1_51: # %overflow.no.rhs.only
+; LA32-NEXT:    add.w $t0, $a7, $t2
+; LA32-NEXT:    sltu $a7, $t0, $a7
+; LA32-NEXT:    add.w $a7, $t1, $a7
+; LA32-NEXT:    xor $a3, $a0, $a3
+; LA32-NEXT:    sub.w $t3, $zero, $a3
+; LA32-NEXT:    xor $a4, $a4, $t3
+; LA32-NEXT:    xor $a6, $a6, $t3
+; LA32-NEXT:    add.w $a0, $a6, $a3
+; LA32-NEXT:    sltu $a6, $a0, $a6
+; LA32-NEXT:    add.w $t1, $a4, $a6
+; LA32-NEXT:    sltui $a4, $t1, 1
+; LA32-NEXT:    sltu $a3, $a0, $a3
+; LA32-NEXT:    and $a3, $a4, $a3
+; LA32-NEXT:    xor $a4, $a5, $t3
+; LA32-NEXT:    xor $a1, $a1, $t3
+; LA32-NEXT:    add.w $t2, $a1, $a3
+; LA32-NEXT:    sltu $a1, $t2, $a1
+; LA32-NEXT:    add.w $a1, $a4, $a1
+; LA32-NEXT:    sltui $a4, $a1, 1
+; LA32-NEXT:    sltu $a3, $t2, $a3
+; LA32-NEXT:    and $a3, $a4, $a3
+; LA32-NEXT:    xor $a4, $a7, $t3
+; LA32-NEXT:    xor $a5, $t0, $t3
+; LA32-NEXT:    add.w $a3, $a5, $a3
+; LA32-NEXT:    sltu $a5, $a3, $a5
+; LA32-NEXT:    add.w $a4, $a4, $a5
+; LA32-NEXT:    or $a3, $a3, $a4
+; LA32-NEXT:  .LBB1_52: # %overflow.res
+; LA32-NEXT:    sltu $t3, $zero, $a3
+; LA32-NEXT:  .LBB1_53: # %overflow.res
+; LA32-NEXT:    st.w $a0, $a2, 0
+; LA32-NEXT:    st.w $t1, $a2, 4
+; LA32-NEXT:    st.w $t2, $a2, 8
+; LA32-NEXT:    andi $a0, $t3, 1
+; LA32-NEXT:    st.w $a1, $a2, 12
 ; LA32-NEXT:    ld.w $s8, $sp, 4 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s7, $sp, 8 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $s6, $sp, 12 # 4-byte Folded Reload
@@ -295,7 +838,13 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: smuloi128:
-; LA64:       # %bb.0:
+; LA64:       # %bb.0: # %overflow.entry
+; LA64-NEXT:    srai.d $a6, $a0, 63
+; LA64-NEXT:    srai.d $a5, $a2, 63
+; LA64-NEXT:    beq $a1, $a6, .LBB1_3
+; LA64-NEXT:  # %bb.1: # %overflow.lhs
+; LA64-NEXT:    beq $a3, $a5, .LBB1_5
+; LA64-NEXT:  # %bb.2: # %overflow
 ; LA64-NEXT:    mulh.du $a5, $a0, $a2
 ; LA64-NEXT:    mul.d $a6, $a1, $a2
 ; LA64-NEXT:    add.d $a5, $a6, $a5
@@ -329,11 +878,129 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; LA64-NEXT:    xor $a1, $a1, $a6
 ; LA64-NEXT:    xor $a3, $a3, $a6
 ; LA64-NEXT:    or $a1, $a3, $a1
-; LA64-NEXT:    sltu $a1, $zero, $a1
+; LA64-NEXT:    sltu $a6, $zero, $a1
+; LA64-NEXT:    b .LBB1_8
+; LA64-NEXT:  .LBB1_3: # %overflow.no.lhs
+; LA64-NEXT:    beq $a3, $a5, .LBB1_7
+; LA64-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; LA64-NEXT:    slti $a5, $a1, 0
+; LA64-NEXT:    masknez $a6, $a0, $a5
+; LA64-NEXT:    sub.d $a7, $zero, $a0
+; LA64-NEXT:    maskeqz $a7, $a7, $a5
+; LA64-NEXT:    or $a7, $a7, $a6
+; LA64-NEXT:    masknez $t0, $a1, $a5
+; LA64-NEXT:    sltu $a0, $zero, $a0
+; LA64-NEXT:    add.d $a0, $a1, $a0
+; LA64-NEXT:    sub.d $a0, $zero, $a0
+; LA64-NEXT:    maskeqz $a0, $a0, $a5
+; LA64-NEXT:    or $a0, $a0, $t0
+; LA64-NEXT:    maskeqz $a0, $a0, $a5
+; LA64-NEXT:    or $a0, $a0, $t0
+; LA64-NEXT:    maskeqz $a1, $a7, $a5
+; LA64-NEXT:    or $a1, $a1, $a6
+; LA64-NEXT:    slti $a6, $a3, 0
+; LA64-NEXT:    masknez $a7, $a2, $a6
+; LA64-NEXT:    sub.d $t0, $zero, $a2
+; LA64-NEXT:    maskeqz $t0, $t0, $a6
+; LA64-NEXT:    or $t0, $t0, $a7
+; LA64-NEXT:    masknez $t1, $a3, $a6
+; LA64-NEXT:    sltu $a2, $zero, $a2
+; LA64-NEXT:    add.d $a2, $a3, $a2
+; LA64-NEXT:    sub.d $a2, $zero, $a2
+; LA64-NEXT:    maskeqz $a2, $a2, $a6
+; LA64-NEXT:    or $a2, $a2, $t1
+; LA64-NEXT:    maskeqz $a2, $a2, $a6
+; LA64-NEXT:    or $a2, $a2, $t1
+; LA64-NEXT:    maskeqz $a3, $t0, $a6
+; LA64-NEXT:    or $a3, $a3, $a7
+; LA64-NEXT:    mulh.du $a7, $a1, $a3
+; LA64-NEXT:    mul.d $t0, $a0, $a3
+; LA64-NEXT:    add.d $a7, $a7, $t0
+; LA64-NEXT:    mul.d $a3, $a1, $a3
+; LA64-NEXT:    mul.d $a0, $a0, $a2
+; LA64-NEXT:    mulh.du $t0, $a1, $a2
+; LA64-NEXT:    add.d $a0, $t0, $a0
+; LA64-NEXT:    mul.d $a1, $a1, $a2
+; LA64-NEXT:    add.d $a1, $a7, $a1
+; LA64-NEXT:    sltu $a2, $a1, $a7
+; LA64-NEXT:    add.d $a2, $a0, $a2
+; LA64-NEXT:    xor $a5, $a6, $a5
+; LA64-NEXT:    sub.d $a6, $zero, $a5
+; LA64-NEXT:    xor $a0, $a3, $a6
+; LA64-NEXT:    add.d $a0, $a0, $a5
+; LA64-NEXT:    sltu $a3, $a0, $a5
+; LA64-NEXT:    xor $a1, $a1, $a6
+; LA64-NEXT:    add.d $a5, $a1, $a3
+; LA64-NEXT:    sltu $a1, $a5, $a3
+; LA64-NEXT:    b .LBB1_6
+; LA64-NEXT:  .LBB1_5: # %overflow.no.rhs.only
+; LA64-NEXT:    slti $a5, $a3, 0
+; LA64-NEXT:    masknez $a6, $a2, $a5
+; LA64-NEXT:    sub.d $a7, $zero, $a2
+; LA64-NEXT:    maskeqz $a7, $a7, $a5
+; LA64-NEXT:    or $a7, $a7, $a6
+; LA64-NEXT:    masknez $t0, $a3, $a5
+; LA64-NEXT:    sltu $a2, $zero, $a2
+; LA64-NEXT:    add.d $a2, $a3, $a2
+; LA64-NEXT:    sub.d $a2, $zero, $a2
+; LA64-NEXT:    maskeqz $a2, $a2, $a5
+; LA64-NEXT:    or $a2, $a2, $t0
+; LA64-NEXT:    maskeqz $a2, $a2, $a5
+; LA64-NEXT:    or $a2, $a2, $t0
+; LA64-NEXT:    maskeqz $a3, $a7, $a5
+; LA64-NEXT:    or $a3, $a3, $a6
+; LA64-NEXT:    slti $a6, $a1, 0
+; LA64-NEXT:    masknez $a7, $a0, $a6
+; LA64-NEXT:    sub.d $t0, $zero, $a0
+; LA64-NEXT:    maskeqz $t0, $t0, $a6
+; LA64-NEXT:    or $t0, $t0, $a7
+; LA64-NEXT:    masknez $t1, $a1, $a6
+; LA64-NEXT:    sltu $a0, $zero, $a0
+; LA64-NEXT:    add.d $a0, $a1, $a0
+; LA64-NEXT:    sub.d $a0, $zero, $a0
+; LA64-NEXT:    maskeqz $a0, $a0, $a6
+; LA64-NEXT:    or $a0, $a0, $t1
+; LA64-NEXT:    maskeqz $a0, $a0, $a6
+; LA64-NEXT:    or $a0, $a0, $t1
+; LA64-NEXT:    maskeqz $a1, $t0, $a6
+; LA64-NEXT:    or $a1, $a1, $a7
+; LA64-NEXT:    mulh.du $a7, $a3, $a1
+; LA64-NEXT:    mul.d $t0, $a2, $a1
+; LA64-NEXT:    add.d $a7, $a7, $t0
+; LA64-NEXT:    mul.d $a1, $a3, $a1
+; LA64-NEXT:    mul.d $a2, $a2, $a0
+; LA64-NEXT:    mulh.du $t0, $a3, $a0
+; LA64-NEXT:    add.d $a2, $t0, $a2
+; LA64-NEXT:    mul.d $a0, $a3, $a0
+; LA64-NEXT:    add.d $a3, $a7, $a0
+; LA64-NEXT:    sltu $a0, $a3, $a7
+; LA64-NEXT:    add.d $a2, $a2, $a0
+; LA64-NEXT:    xor $a5, $a5, $a6
+; LA64-NEXT:    sub.d $a6, $zero, $a5
+; LA64-NEXT:    xor $a0, $a1, $a6
+; LA64-NEXT:    add.d $a0, $a0, $a5
+; LA64-NEXT:    sltu $a1, $a0, $a5
+; LA64-NEXT:    xor $a3, $a3, $a6
+; LA64-NEXT:    add.d $a5, $a3, $a1
+; LA64-NEXT:    sltu $a1, $a5, $a1
+; LA64-NEXT:  .LBB1_6: # %overflow.res
+; LA64-NEXT:    xor $a2, $a2, $a6
+; LA64-NEXT:    add.d $a1, $a2, $a1
+; LA64-NEXT:    sltu $a6, $zero, $a1
+; LA64-NEXT:    b .LBB1_9
+; LA64-NEXT:  .LBB1_7: # %overflow.no
+; LA64-NEXT:    move $a6, $zero
+; LA64-NEXT:    mulh.du $a5, $a0, $a2
+; LA64-NEXT:    mul.d $a3, $a0, $a3
+; LA64-NEXT:    add.d $a3, $a5, $a3
+; LA64-NEXT:    mul.d $a1, $a1, $a2
+; LA64-NEXT:    add.d $a5, $a3, $a1
+; LA64-NEXT:  .LBB1_8: # %overflow.res
 ; LA64-NEXT:    mul.d $a0, $a0, $a2
+; LA64-NEXT:  .LBB1_9: # %overflow.res
 ; LA64-NEXT:    st.d $a0, $a4, 0
+; LA64-NEXT:    andi $a0, $a6, 1
 ; LA64-NEXT:    st.d $a5, $a4, 8
-; LA64-NEXT:    move $a0, $a1
 ; LA64-NEXT:    ret
   %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
   %val = extractvalue {i128, i1} %t, 0
diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll
index f573fdab1b153..5bebf54c3c1a0 100644
--- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll
@@ -4,136 +4,343 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; PPC64-LABEL: muloti_test:
-; PPC64:       # %bb.0: # %start
-; PPC64-NEXT:    addic 9, 5, -1
-; PPC64-NEXT:    mulld 10, 5, 4
-; PPC64-NEXT:    mulld 11, 3, 6
-; PPC64-NEXT:    subfe 9, 9, 5
-; PPC64-NEXT:    add 10, 11, 10
-; PPC64-NEXT:    addic 11, 3, -1
-; PPC64-NEXT:    mulhdu 8, 3, 6
-; PPC64-NEXT:    subfe 3, 11, 3
-; PPC64-NEXT:    and 3, 3, 9
-; PPC64-NEXT:    addic 9, 8, -1
-; PPC64-NEXT:    subfe 8, 9, 8
-; PPC64-NEXT:    or 3, 3, 8
-; PPC64-NEXT:    mulhdu 5, 5, 4
-; PPC64-NEXT:    addic 8, 5, -1
-; PPC64-NEXT:    subfe 5, 8, 5
-; PPC64-NEXT:    li 7, 0
-; PPC64-NEXT:    or 5, 3, 5
-; PPC64-NEXT:    mulhdu 8, 4, 6
-; PPC64-NEXT:    addc 3, 8, 10
-; PPC64-NEXT:    addze 7, 7
-; PPC64-NEXT:    addic 8, 7, -1
-; PPC64-NEXT:    subfe 7, 8, 7
-; PPC64-NEXT:    or 5, 5, 7
+; PPC64:       # %bb.0: # %overflow.entry
+; PPC64-NEXT:    cmpldi 3, 0
+; PPC64-NEXT:    beq 0, .LBB0_3
+; PPC64-NEXT:  # %bb.1: # %overflow.lhs
+; PPC64-NEXT:    cmpldi 5, 0
+; PPC64-NEXT:    beq 0, .LBB0_5
+; PPC64-NEXT:  # %bb.2: # %overflow
+; PPC64-NEXT:    mulhdu. 7, 3, 6
+; PPC64-NEXT:    mcrf 5, 0
+; PPC64-NEXT:    cmpdi 6, 5, 0
+; PPC64-NEXT:    mulhdu. 7, 5, 4
+; PPC64-NEXT:    mcrf 1, 0
+; PPC64-NEXT:    cmpdi 3, 0
+; PPC64-NEXT:    mulld 5, 5, 4
+; PPC64-NEXT:    mulld 3, 3, 6
+; PPC64-NEXT:    crnor 20, 26, 2
+; PPC64-NEXT:    add 3, 3, 5
+; PPC64-NEXT:    crorc 20, 20, 22
+; PPC64-NEXT:    mulhdu 7, 4, 6
+; PPC64-NEXT:    addc 3, 7, 3
+; PPC64-NEXT:    li 5, 0
+; PPC64-NEXT:    addze. 5, 5
+; PPC64-NEXT:    crorc 20, 20, 6
+; PPC64-NEXT:    crorc 20, 20, 2
 ; PPC64-NEXT:    mulld 4, 4, 6
+; PPC64-NEXT:    b .LBB0_7
+; PPC64-NEXT:  .LBB0_3: # %overflow.no.lhs
+; PPC64-NEXT:    cmpldi 5, 0
+; PPC64-NEXT:    beq 0, .LBB0_6
+; PPC64-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; PPC64-NEXT:    mulhdu 7, 4, 6
+; PPC64-NEXT:    mulld 8, 3, 6
+; PPC64-NEXT:    mulld 9, 3, 5
+; PPC64-NEXT:    add 3, 7, 8
+; PPC64-NEXT:    mulhdu 7, 4, 5
+; PPC64-NEXT:    mulld 5, 4, 5
+; PPC64-NEXT:    mulld 4, 4, 6
+; PPC64-NEXT:    addc 3, 3, 5
+; PPC64-NEXT:    adde. 5, 7, 9
+; PPC64-NEXT:    crnot 20, 2
+; PPC64-NEXT:    b .LBB0_7
+; PPC64-NEXT:  .LBB0_5: # %overflow.no.rhs.only
+; PPC64-NEXT:    mulhdu 7, 6, 4
+; PPC64-NEXT:    mulld 8, 5, 4
+; PPC64-NEXT:    mulld 5, 5, 3
+; PPC64-NEXT:    mulld 4, 6, 4
+; PPC64-NEXT:    add 7, 7, 8
+; PPC64-NEXT:    mulhdu 8, 6, 3
+; PPC64-NEXT:    mulld 3, 6, 3
+; PPC64-NEXT:    addc 3, 7, 3
+; PPC64-NEXT:    adde. 5, 8, 5
+; PPC64-NEXT:    crnot 20, 2
+; PPC64-NEXT:    b .LBB0_7
+; PPC64-NEXT:  .LBB0_6: # %overflow.no
+; PPC64-NEXT:    mulld 5, 4, 5
+; PPC64-NEXT:    mulhdu 7, 4, 6
+; PPC64-NEXT:    mulld 3, 3, 6
+; PPC64-NEXT:    add 5, 7, 5
+; PPC64-NEXT:    mulld 4, 4, 6
+; PPC64-NEXT:    add 3, 5, 3
+; PPC64-NEXT:    crxor 20, 20, 20
+; PPC64-NEXT:  .LBB0_7: # %overflow.res
+; PPC64-NEXT:    li 5, 1
+; PPC64-NEXT:    bclr 12, 20, 0
+; PPC64-NEXT:  # %bb.8: # %overflow.res
+; PPC64-NEXT:    li 5, 0
 ; PPC64-NEXT:    blr
 ;
 ; PPC32-LABEL: muloti_test:
-; PPC32:       # %bb.0: # %start
-; PPC32-NEXT:    stwu 1, -64(1)
-; PPC32-NEXT:    stw 26, 40(1) # 4-byte Folded Spill
+; PPC32:       # %bb.0: # %overflow.entry
+; PPC32-NEXT:    stwu 1, -80(1)
+; PPC32-NEXT:    stw 30, 72(1) # 4-byte Folded Spill
 ; PPC32-NEXT:    mfcr 12
-; PPC32-NEXT:    stw 27, 44(1) # 4-byte Folded Spill
-; PPC32-NEXT:    mullw 27, 9, 4
-; PPC32-NEXT:    stw 21, 20(1) # 4-byte Folded Spill
-; PPC32-NEXT:    mr 11, 7
-; PPC32-NEXT:    stw 22, 24(1) # 4-byte Folded Spill
-; PPC32-NEXT:    li 7, 0
-; PPC32-NEXT:    mullw 26, 3, 10
-; PPC32-NEXT:    stw 23, 28(1) # 4-byte Folded Spill
-; PPC32-NEXT:    add 27, 26, 27
-; PPC32-NEXT:    stw 24, 32(1) # 4-byte Folded Spill
-; PPC32-NEXT:    cmpwi 7, 11, 0
-; PPC32-NEXT:    stw 25, 36(1) # 4-byte Folded Spill
-; PPC32-NEXT:    mullw 24, 11, 6
-; PPC32-NEXT:    stw 28, 48(1) # 4-byte Folded Spill
-; PPC32-NEXT:    stw 29, 52(1) # 4-byte Folded Spill
-; PPC32-NEXT:    stw 30, 56(1) # 4-byte Folded Spill
-; PPC32-NEXT:    mulhwu 0, 8, 6
-; PPC32-NEXT:    stw 12, 16(1)
-; PPC32-NEXT:    mr 12, 5
-; PPC32-NEXT:    mulhwu 5, 4, 10
-; PPC32-NEXT:    addc 5, 5, 27
-; PPC32-NEXT:    addze 27, 7
-; PPC32-NEXT:    cmpwi 2, 27, 0
-; PPC32-NEXT:    mullw 25, 12, 8
-; PPC32-NEXT:    add 26, 24, 25
-; PPC32-NEXT:    addc 0, 0, 26
-; PPC32-NEXT:    addze 26, 7
-; PPC32-NEXT:    mullw 23, 8, 6
-; PPC32-NEXT:    mullw 22, 4, 10
-; PPC32-NEXT:    addc 24, 22, 23
-; PPC32-NEXT:    adde 22, 5, 0
-; PPC32-NEXT:    mulhwu 29, 6, 10
-; PPC32-NEXT:    mullw 21, 12, 10
-; PPC32-NEXT:    addc 5, 21, 29
-; PPC32-NEXT:    mulhwu 30, 12, 10
-; PPC32-NEXT:    addze 0, 30
-; PPC32-NEXT:    mullw 23, 6, 9
-; PPC32-NEXT:    addc 5, 23, 5
-; PPC32-NEXT:    mulhwu 28, 6, 9
-; PPC32-NEXT:    addze 29, 28
-; PPC32-NEXT:    addc 0, 0, 29
-; PPC32-NEXT:    addze 29, 7
-; PPC32-NEXT:    mullw 30, 12, 9
-; PPC32-NEXT:    addc 0, 30, 0
-; PPC32-NEXT:    mulhwu 25, 12, 9
-; PPC32-NEXT:    adde 30, 25, 29
-; PPC32-NEXT:    addc 0, 0, 24
-; PPC32-NEXT:    adde 30, 30, 22
-; PPC32-NEXT:    addze. 29, 7
+; PPC32-NEXT:    or. 30, 4, 3
+; PPC32-NEXT:    stw 18, 24(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 19, 28(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 20, 32(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 21, 36(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 22, 40(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 23, 44(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 24, 48(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 25, 52(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 26, 56(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 27, 60(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 28, 64(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 29, 68(1) # 4-byte Folded Spill
+; PPC32-NEXT:    stw 12, 20(1)
+; PPC32-NEXT:    beq 0, .LBB0_3
+; PPC32-NEXT:  # %bb.1: # %overflow.lhs
+; PPC32-NEXT:    or. 29, 8, 7
+; PPC32-NEXT:    beq 0, .LBB0_5
+; PPC32-NEXT:  # %bb.2: # %overflow
+; PPC32-NEXT:    mullw 28, 9, 4
+; PPC32-NEXT:    li 19, 0
+; PPC32-NEXT:    cmpwi 2, 7, 0
+; PPC32-NEXT:    cmpwi 3, 5, 0
+; PPC32-NEXT:    cmpwi 7, 3, 0
+; PPC32-NEXT:    mullw 27, 3, 10
+; PPC32-NEXT:    add 28, 27, 28
+; PPC32-NEXT:    mulhwu 11, 4, 10
+; PPC32-NEXT:    addc 11, 11, 28
+; PPC32-NEXT:    addze 28, 19
+; PPC32-NEXT:    mullw 24, 5, 8
+; PPC32-NEXT:    mullw 23, 7, 6
+; PPC32-NEXT:    add 27, 23, 24
+; PPC32-NEXT:    mulhwu 12, 8, 6
+; PPC32-NEXT:    addc 12, 12, 27
+; PPC32-NEXT:    addze 27, 19
+; PPC32-NEXT:    mullw 22, 8, 6
+; PPC32-NEXT:    mullw 21, 4, 10
+; PPC32-NEXT:    addc 23, 21, 22
+; PPC32-NEXT:    adde 21, 11, 12
+; PPC32-NEXT:    mulhwu 26, 6, 10
+; PPC32-NEXT:    mullw 20, 5, 10
+; PPC32-NEXT:    addc 11, 20, 26
+; PPC32-NEXT:    mulhwu 0, 5, 10
+; PPC32-NEXT:    addze 12, 0
+; PPC32-NEXT:    mullw 22, 6, 9
+; PPC32-NEXT:    addc 11, 22, 11
+; PPC32-NEXT:    mulhwu 25, 6, 9
+; PPC32-NEXT:    addze 26, 25
+; PPC32-NEXT:    addc 12, 12, 26
+; PPC32-NEXT:    addze 26, 19
+; PPC32-NEXT:    mullw 0, 5, 9
+; PPC32-NEXT:    addc 12, 0, 12
+; PPC32-NEXT:    mulhwu 24, 5, 9
+; PPC32-NEXT:    adde 0, 24, 26
+; PPC32-NEXT:    addc 12, 12, 23
+; PPC32-NEXT:    adde 0, 0, 21
+; PPC32-NEXT:    addze. 26, 19
 ; PPC32-NEXT:    mcrf 1, 0
-; PPC32-NEXT:    mulhwu. 29, 11, 6
-; PPC32-NEXT:    mcrf 6, 0
-; PPC32-NEXT:    mulhwu. 29, 12, 8
+; PPC32-NEXT:    mulhwu. 26, 7, 6
 ; PPC32-NEXT:    mcrf 5, 0
-; PPC32-NEXT:    cmpwi 12, 0
-; PPC32-NEXT:    crnor 20, 2, 30
-; PPC32-NEXT:    cmpwi 3, 0
-; PPC32-NEXT:    cmpwi 7, 9, 0
-; PPC32-NEXT:    crnor 24, 30, 2
-; PPC32-NEXT:    mulhwu. 12, 3, 10
-; PPC32-NEXT:    crorc 20, 20, 26
-; PPC32-NEXT:    mcrf 7, 0
+; PPC32-NEXT:    crnor 20, 14, 10
 ; PPC32-NEXT:    crorc 20, 20, 22
-; PPC32-NEXT:    cmpwi 26, 0
-; PPC32-NEXT:    crorc 28, 20, 2
-; PPC32-NEXT:    mulhwu. 9, 9, 4
-; PPC32-NEXT:    mcrf 5, 0
-; PPC32-NEXT:    crorc 20, 24, 30
-; PPC32-NEXT:    or. 3, 4, 3
+; PPC32-NEXT:    cmpwi 2, 30, 0
+; PPC32-NEXT:    cmpwi 3, 29, 0
+; PPC32-NEXT:    mulhwu. 5, 5, 8
 ; PPC32-NEXT:    mcrf 6, 0
-; PPC32-NEXT:    crorc 20, 20, 22
-; PPC32-NEXT:    or. 3, 8, 11
-; PPC32-NEXT:    crorc 20, 20, 10
-; PPC32-NEXT:    crnor 21, 2, 26
+; PPC32-NEXT:    cmpwi 9, 0
+; PPC32-NEXT:    crnor 21, 2, 30
+; PPC32-NEXT:    crorc 20, 20, 26
+; PPC32-NEXT:    crnor 23, 14, 10
+; PPC32-NEXT:    mulhwu. 3, 3, 10
+; PPC32-NEXT:    mcrf 7, 0
+; PPC32-NEXT:    cmpwi 27, 0
+; PPC32-NEXT:    crorc 20, 20, 2
+; PPC32-NEXT:    crorc 21, 21, 30
+; PPC32-NEXT:    mulhwu. 3, 9, 4
+; PPC32-NEXT:    crorc 21, 21, 2
+; PPC32-NEXT:    cmpwi 28, 0
+; PPC32-NEXT:    crorc 21, 21, 2
+; PPC32-NEXT:    cror 21, 23, 21
 ; PPC32-NEXT:    cror 20, 21, 20
-; PPC32-NEXT:    cror 20, 20, 28
-; PPC32-NEXT:    crandc 20, 6, 20
+; PPC32-NEXT:    crorc 20, 20, 6
 ; PPC32-NEXT:    mullw 6, 6, 10
-; PPC32-NEXT:    bc 12, 20, .LBB0_2
-; PPC32-NEXT:  # %bb.1: # %start
 ; PPC32-NEXT:    li 7, 1
-; PPC32-NEXT:  .LBB0_2: # %start
-; PPC32-NEXT:    lwz 12, 16(1)
-; PPC32-NEXT:    mr 3, 30
-; PPC32-NEXT:    mr 4, 0
-; PPC32-NEXT:    lwz 30, 56(1) # 4-byte Folded Reload
+; PPC32-NEXT:    bc 4, 20, .LBB0_7
+; PPC32-NEXT:    b .LBB0_8
+; PPC32-NEXT:  .LBB0_3: # %overflow.no.lhs
+; PPC32-NEXT:    or. 11, 8, 7
+; PPC32-NEXT:    beq 0, .LBB0_9
+; PPC32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; PPC32-NEXT:    mulhwu 29, 10, 4
+; PPC32-NEXT:    mullw 20, 10, 3
+; PPC32-NEXT:    add 29, 29, 20
+; PPC32-NEXT:    mulhwu 12, 6, 10
+; PPC32-NEXT:    mulhwu 0, 6, 9
+; PPC32-NEXT:    mulhwu 30, 5, 9
+; PPC32-NEXT:    mulhwu 24, 8, 4
+; PPC32-NEXT:    mullw 23, 5, 10
+; PPC32-NEXT:    addc 12, 23, 12
+; PPC32-NEXT:    mullw 22, 6, 9
+; PPC32-NEXT:    mullw 21, 5, 9
+; PPC32-NEXT:    mullw 9, 9, 4
+; PPC32-NEXT:    add 9, 29, 9
+; PPC32-NEXT:    mullw 3, 8, 3
+; PPC32-NEXT:    add 3, 24, 3
+; PPC32-NEXT:    mulhwu 11, 5, 10
+; PPC32-NEXT:    mullw 29, 7, 4
+; PPC32-NEXT:    add 3, 3, 29
+; PPC32-NEXT:    addze 29, 11
+; PPC32-NEXT:    addc 11, 22, 12
+; PPC32-NEXT:    addze 0, 0
+; PPC32-NEXT:    li 12, 0
+; PPC32-NEXT:    addc 0, 29, 0
+; PPC32-NEXT:    addze 29, 12
+; PPC32-NEXT:    addc 0, 21, 0
+; PPC32-NEXT:    mullw 19, 10, 4
+; PPC32-NEXT:    adde 30, 30, 29
+; PPC32-NEXT:    addc 0, 0, 19
+; PPC32-NEXT:    adde 9, 30, 9
+; PPC32-NEXT:    mulhwu 27, 6, 8
+; PPC32-NEXT:    mullw 18, 5, 8
+; PPC32-NEXT:    addc 30, 18, 27
+; PPC32-NEXT:    mulhwu 28, 5, 8
+; PPC32-NEXT:    addze 29, 28
+; PPC32-NEXT:    mulhwu 26, 6, 7
+; PPC32-NEXT:    mulhwu 25, 5, 7
+; PPC32-NEXT:    mullw 5, 5, 7
+; PPC32-NEXT:    mullw 7, 6, 7
+; PPC32-NEXT:    addc 7, 7, 30
+; PPC32-NEXT:    addze 30, 26
+; PPC32-NEXT:    addc 30, 29, 30
+; PPC32-NEXT:    addze 12, 12
+; PPC32-NEXT:    addc 5, 5, 30
+; PPC32-NEXT:    mullw 4, 8, 4
+; PPC32-NEXT:    adde 12, 25, 12
+; PPC32-NEXT:    addc 4, 5, 4
+; PPC32-NEXT:    adde 3, 12, 3
+; PPC32-NEXT:    mullw 5, 6, 8
+; PPC32-NEXT:    addc 12, 0, 5
+; PPC32-NEXT:    adde 0, 9, 7
+; PPC32-NEXT:    addze 4, 4
+; PPC32-NEXT:    addze 3, 3
+; PPC32-NEXT:    or. 3, 4, 3
+; PPC32-NEXT:    mullw 6, 6, 10
+; PPC32-NEXT:    b .LBB0_6
+; PPC32-NEXT:  .LBB0_5: # %overflow.no.rhs.only
+; PPC32-NEXT:    mulhwu 29, 6, 8
+; PPC32-NEXT:    mullw 20, 6, 7
+; PPC32-NEXT:    add 29, 29, 20
+; PPC32-NEXT:    mulhwu 12, 10, 6
+; PPC32-NEXT:    mulhwu 0, 10, 5
+; PPC32-NEXT:    mulhwu 30, 9, 5
+; PPC32-NEXT:    mulhwu 24, 4, 8
+; PPC32-NEXT:    mullw 23, 9, 6
+; PPC32-NEXT:    addc 12, 23, 12
+; PPC32-NEXT:    mullw 22, 10, 5
+; PPC32-NEXT:    mullw 21, 9, 5
+; PPC32-NEXT:    mullw 5, 5, 8
+; PPC32-NEXT:    add 5, 29, 5
+; PPC32-NEXT:    mullw 7, 4, 7
+; PPC32-NEXT:    add 7, 24, 7
+; PPC32-NEXT:    mulhwu 11, 9, 6
+; PPC32-NEXT:    mullw 29, 3, 8
+; PPC32-NEXT:    add 7, 7, 29
+; PPC32-NEXT:    addze 29, 11
+; PPC32-NEXT:    addc 11, 22, 12
+; PPC32-NEXT:    addze 0, 0
+; PPC32-NEXT:    li 12, 0
+; PPC32-NEXT:    addc 0, 29, 0
+; PPC32-NEXT:    addze 29, 12
+; PPC32-NEXT:    addc 0, 21, 0
+; PPC32-NEXT:    mullw 19, 6, 8
+; PPC32-NEXT:    adde 30, 30, 29
+; PPC32-NEXT:    addc 0, 0, 19
+; PPC32-NEXT:    adde 5, 30, 5
+; PPC32-NEXT:    mulhwu 27, 10, 4
+; PPC32-NEXT:    mullw 18, 9, 4
+; PPC32-NEXT:    addc 30, 18, 27
+; PPC32-NEXT:    mulhwu 28, 9, 4
+; PPC32-NEXT:    addze 29, 28
+; PPC32-NEXT:    mulhwu 26, 10, 3
+; PPC32-NEXT:    mulhwu 25, 9, 3
+; PPC32-NEXT:    mullw 9, 9, 3
+; PPC32-NEXT:    mullw 3, 10, 3
+; PPC32-NEXT:    addc 3, 3, 30
+; PPC32-NEXT:    addze 30, 26
+; PPC32-NEXT:    addc 30, 29, 30
+; PPC32-NEXT:    addze 12, 12
+; PPC32-NEXT:    addc 9, 9, 30
+; PPC32-NEXT:    mullw 8, 4, 8
+; PPC32-NEXT:    adde 12, 25, 12
+; PPC32-NEXT:    addc 8, 9, 8
+; PPC32-NEXT:    adde 7, 12, 7
+; PPC32-NEXT:    mullw 4, 10, 4
+; PPC32-NEXT:    addc 12, 0, 4
+; PPC32-NEXT:    adde 0, 5, 3
+; PPC32-NEXT:    addze 3, 8
+; PPC32-NEXT:    addze 4, 7
+; PPC32-NEXT:    or. 3, 3, 4
+; PPC32-NEXT:    mullw 6, 10, 6
+; PPC32-NEXT:  .LBB0_6: # %overflow.no.rhs.only
+; PPC32-NEXT:    crnot 20, 2
+; PPC32-NEXT:    li 7, 1
+; PPC32-NEXT:    bc 12, 20, .LBB0_8
+; PPC32-NEXT:  .LBB0_7: # %overflow.res
+; PPC32-NEXT:    li 7, 0
+; PPC32-NEXT:  .LBB0_8: # %overflow.res
+; PPC32-NEXT:    mr 4, 12
+; PPC32-NEXT:    lwz 12, 20(1)
+; PPC32-NEXT:    mr 3, 0
+; PPC32-NEXT:    mr 5, 11
+; PPC32-NEXT:    lwz 30, 72(1) # 4-byte Folded Reload
 ; PPC32-NEXT:    mtcrf 32, 12 # cr2
-; PPC32-NEXT:    lwz 29, 52(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 28, 48(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 27, 44(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 26, 40(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 25, 36(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 24, 32(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 23, 28(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 22, 24(1) # 4-byte Folded Reload
-; PPC32-NEXT:    lwz 21, 20(1) # 4-byte Folded Reload
-; PPC32-NEXT:    addi 1, 1, 64
+; PPC32-NEXT:    mtcrf 16, 12 # cr3
+; PPC32-NEXT:    lwz 29, 68(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 28, 64(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 27, 60(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 26, 56(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 25, 52(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 24, 48(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 23, 44(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 22, 40(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 21, 36(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 20, 32(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 19, 28(1) # 4-byte Folded Reload
+; PPC32-NEXT:    lwz 18, 24(1) # 4-byte Folded Reload
+; PPC32-NEXT:    addi 1, 1, 80
 ; PPC32-NEXT:    blr
+; PPC32-NEXT:  .LBB0_9: # %overflow.no
+; PPC32-NEXT:    mulhwu 11, 10, 4
+; PPC32-NEXT:    mulhwu 12, 8, 6
+; PPC32-NEXT:    mullw 3, 10, 3
+; PPC32-NEXT:    add 3, 11, 3
+; PPC32-NEXT:    mullw 26, 8, 5
+; PPC32-NEXT:    mulhwu 0, 5, 10
+; PPC32-NEXT:    mulhwu 30, 6, 10
+; PPC32-NEXT:    mulhwu 29, 6, 9
+; PPC32-NEXT:    mulhwu 28, 5, 9
+; PPC32-NEXT:    mullw 27, 9, 4
+; PPC32-NEXT:    add 3, 3, 27
+; PPC32-NEXT:    mullw 7, 7, 6
+; PPC32-NEXT:    mullw 4, 10, 4
+; PPC32-NEXT:    mullw 8, 8, 6
+; PPC32-NEXT:    addc 4, 8, 4
+; PPC32-NEXT:    li 8, 0
+; PPC32-NEXT:    mullw 25, 5, 10
+; PPC32-NEXT:    mullw 5, 5, 9
+; PPC32-NEXT:    mullw 9, 6, 9
+; PPC32-NEXT:    mullw 6, 6, 10
+; PPC32-NEXT:    add 10, 12, 26
+; PPC32-NEXT:    add 7, 10, 7
+; PPC32-NEXT:    adde 3, 7, 3
+; PPC32-NEXT:    addc 7, 25, 30
+; PPC32-NEXT:    addze 10, 0
+; PPC32-NEXT:    addc 11, 9, 7
+; PPC32-NEXT:    addze 7, 29
+; PPC32-NEXT:    addc 7, 10, 7
+; PPC32-NEXT:    addze 8, 8
+; PPC32-NEXT:    addc 5, 5, 7
+; PPC32-NEXT:    adde 7, 28, 8
+; PPC32-NEXT:    addc 12, 5, 4
+; PPC32-NEXT:    adde 0, 7, 3
+; PPC32-NEXT:    li 7, 1
+; PPC32-NEXT:    b .LBB0_7
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
   %1 = extractvalue { i128, i1 } %0, 0
diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index d6fd4f15c4e53..4c9aeaa3ba5a1 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -3,7 +3,7 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-LABEL: muloti_test:
-; RISCV32:       # %bb.0: # %start
+; RISCV32:       # %bb.0: # %overflow.entry
 ; RISCV32-NEXT:    addi sp, sp, -32
 ; RISCV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
@@ -11,100 +11,301 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a4, 0(a1)
+; RISCV32-NEXT:    sw s6, 4(sp) # 4-byte Folded Spill
+; RISCV32-NEXT:    sw s7, 0(sp) # 4-byte Folded Spill
+; RISCV32-NEXT:    lw a3, 0(a1)
 ; RISCV32-NEXT:    lw t0, 4(a1)
-; RISCV32-NEXT:    lw a3, 8(a1)
-; RISCV32-NEXT:    lw a1, 12(a1)
-; RISCV32-NEXT:    lw a6, 0(a2)
-; RISCV32-NEXT:    lw a5, 4(a2)
-; RISCV32-NEXT:    lw a7, 8(a2)
+; RISCV32-NEXT:    lw a4, 8(a1)
+; RISCV32-NEXT:    lw a6, 12(a1)
+; RISCV32-NEXT:    lw a1, 0(a2)
+; RISCV32-NEXT:    lw a7, 4(a2)
+; RISCV32-NEXT:    lw a5, 8(a2)
 ; RISCV32-NEXT:    lw a2, 12(a2)
-; RISCV32-NEXT:    mulhu t1, a4, a6
-; RISCV32-NEXT:    mul t2, t0, a6
-; RISCV32-NEXT:    mulhu t3, t0, a6
-; RISCV32-NEXT:    mul t4, a4, a5
-; RISCV32-NEXT:    mulhu t5, a4, a5
+; RISCV32-NEXT:    or t4, a4, a6
+; RISCV32-NEXT:    beqz t4, .LBB0_5
+; RISCV32-NEXT:  # %bb.1: # %overflow.lhs
+; RISCV32-NEXT:    or t5, a5, a2
+; RISCV32-NEXT:    beqz t5, .LBB0_9
+; RISCV32-NEXT:  # %bb.2: # %overflow
+; RISCV32-NEXT:    mulhu t1, a3, a1
+; RISCV32-NEXT:    mul t2, t0, a1
+; RISCV32-NEXT:    mulhu t3, t0, a1
+; RISCV32-NEXT:    mul t6, a3, a7
+; RISCV32-NEXT:    mulhu s0, a3, a7
+; RISCV32-NEXT:    mul s4, t0, a7
+; RISCV32-NEXT:    mul s1, a5, a3
+; RISCV32-NEXT:    mul s5, a4, a1
 ; RISCV32-NEXT:    mul s2, t0, a5
-; RISCV32-NEXT:    mul t6, a7, a4
-; RISCV32-NEXT:    mul s3, a3, a6
-; RISCV32-NEXT:    mul s0, t0, a7
-; RISCV32-NEXT:    mul s1, a2, a4
-; RISCV32-NEXT:    mul s4, a5, a3
-; RISCV32-NEXT:    add s1, s1, s0
-; RISCV32-NEXT:    mul s0, a1, a6
-; RISCV32-NEXT:    add s4, s0, s4
-; RISCV32-NEXT:    mulhu s5, t0, a5
+; RISCV32-NEXT:    mul s3, a2, a3
+; RISCV32-NEXT:    mul s6, a7, a4
+; RISCV32-NEXT:    add s3, s3, s2
+; RISCV32-NEXT:    mul s2, a6, a1
+; RISCV32-NEXT:    add s6, s2, s6
+; RISCV32-NEXT:    mulhu s7, t0, a7
 ; RISCV32-NEXT:    add t1, t2, t1
 ; RISCV32-NEXT:    sltu t2, t1, t2
 ; RISCV32-NEXT:    add t2, t3, t2
-; RISCV32-NEXT:    mulhu s0, a7, a4
-; RISCV32-NEXT:    add t1, t4, t1
-; RISCV32-NEXT:    sltu t3, t1, t4
-; RISCV32-NEXT:    add t3, t5, t3
-; RISCV32-NEXT:    mulhu t5, a3, a6
-; RISCV32-NEXT:    add t4, s3, t6
-; RISCV32-NEXT:    add s1, s0, s1
-; RISCV32-NEXT:    add t6, t5, s4
-; RISCV32-NEXT:    sltu s3, t4, s3
+; RISCV32-NEXT:    mulhu s2, a5, a3
+; RISCV32-NEXT:    add t1, t6, t1
+; RISCV32-NEXT:    sltu t3, t1, t6
+; RISCV32-NEXT:    add t3, s0, t3
+; RISCV32-NEXT:    mulhu s0, a4, a1
+; RISCV32-NEXT:    add t6, s5, s1
+; RISCV32-NEXT:    add s3, s2, s3
+; RISCV32-NEXT:    add s1, s0, s6
+; RISCV32-NEXT:    sltu s5, t6, s5
 ; RISCV32-NEXT:    add t3, t2, t3
 ; RISCV32-NEXT:    sltu t2, t3, t2
-; RISCV32-NEXT:    add s5, s5, t2
-; RISCV32-NEXT:    add s4, t6, s1
-; RISCV32-NEXT:    add t3, s2, t3
-; RISCV32-NEXT:    add t2, t3, t4
-; RISCV32-NEXT:    sltu s2, t3, s2
-; RISCV32-NEXT:    sltu t4, t2, t3
-; RISCV32-NEXT:    add s2, s5, s2
-; RISCV32-NEXT:    add s3, s4, s3
-; RISCV32-NEXT:    add t3, s2, s3
-; RISCV32-NEXT:    add t3, t3, t4
-; RISCV32-NEXT:    beq t3, s2, .LBB0_2
-; RISCV32-NEXT:  # %bb.1: # %start
-; RISCV32-NEXT:    sltu t4, t3, s2
-; RISCV32-NEXT:  .LBB0_2: # %start
-; RISCV32-NEXT:    sltu s0, s1, s0
-; RISCV32-NEXT:    snez s1, t0
-; RISCV32-NEXT:    snez s2, a2
-; RISCV32-NEXT:    sltu t5, t6, t5
-; RISCV32-NEXT:    mulhu t6, a2, a4
-; RISCV32-NEXT:    mulhu t0, t0, a7
-; RISCV32-NEXT:    or a2, a7, a2
-; RISCV32-NEXT:    snez a7, a5
-; RISCV32-NEXT:    mul a4, a4, a6
-; RISCV32-NEXT:    mulhu a6, a1, a6
-; RISCV32-NEXT:    mulhu a5, a5, a3
-; RISCV32-NEXT:    or a3, a3, a1
-; RISCV32-NEXT:    snez a1, a1
-; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    snez t6, t6
-; RISCV32-NEXT:    snez t0, t0
-; RISCV32-NEXT:    and a1, a1, a7
-; RISCV32-NEXT:    snez a6, a6
-; RISCV32-NEXT:    snez a5, a5
+; RISCV32-NEXT:    add s7, s7, t2
+; RISCV32-NEXT:    add s6, s1, s3
+; RISCV32-NEXT:    add t3, s4, t3
+; RISCV32-NEXT:    add t2, t3, t6
+; RISCV32-NEXT:    sltu s4, t3, s4
+; RISCV32-NEXT:    sltu t6, t2, t3
+; RISCV32-NEXT:    add s4, s7, s4
+; RISCV32-NEXT:    add s5, s6, s5
+; RISCV32-NEXT:    add t3, s4, s5
+; RISCV32-NEXT:    add t3, t3, t6
+; RISCV32-NEXT:    beq t3, s4, .LBB0_4
+; RISCV32-NEXT:  # %bb.3: # %overflow
+; RISCV32-NEXT:    sltu t6, t3, s4
+; RISCV32-NEXT:  .LBB0_4: # %overflow
+; RISCV32-NEXT:    sltu s2, s3, s2
+; RISCV32-NEXT:    snez s3, t0
+; RISCV32-NEXT:    snez s4, a2
+; RISCV32-NEXT:    mulhu a2, a2, a3
+; RISCV32-NEXT:    mulhu a5, t0, a5
+; RISCV32-NEXT:    sltu t0, s1, s0
+; RISCV32-NEXT:    snez s0, a7
+; RISCV32-NEXT:    snez s1, a6
+; RISCV32-NEXT:    mulhu a6, a6, a1
+; RISCV32-NEXT:    mulhu a4, a7, a4
+; RISCV32-NEXT:    snez a7, t5
+; RISCV32-NEXT:    snez t4, t4
+; RISCV32-NEXT:    and t5, s4, s3
 ; RISCV32-NEXT:    snez a2, a2
-; RISCV32-NEXT:    snez a3, a3
-; RISCV32-NEXT:    or a7, s1, t6
-; RISCV32-NEXT:    or a1, a1, a6
-; RISCV32-NEXT:    and a2, a3, a2
-; RISCV32-NEXT:    or a3, a7, t0
-; RISCV32-NEXT:    or a1, a1, a5
-; RISCV32-NEXT:    or a3, a3, s0
-; RISCV32-NEXT:    or a1, a1, t5
-; RISCV32-NEXT:    or a1, a2, a1
-; RISCV32-NEXT:    or a1, a1, a3
-; RISCV32-NEXT:    or a1, a1, t4
-; RISCV32-NEXT:    andi a1, a1, 1
-; RISCV32-NEXT:    sw a4, 0(a0)
+; RISCV32-NEXT:    snez a5, a5
+; RISCV32-NEXT:    and s0, s1, s0
+; RISCV32-NEXT:    snez a6, a6
+; RISCV32-NEXT:    snez a4, a4
+; RISCV32-NEXT:    and a7, t4, a7
+; RISCV32-NEXT:    or a2, t5, a2
+; RISCV32-NEXT:    or a6, s0, a6
+; RISCV32-NEXT:    or a2, a2, a5
+; RISCV32-NEXT:    or a4, a6, a4
+; RISCV32-NEXT:    or a2, a2, s2
+; RISCV32-NEXT:    or a4, a4, t0
+; RISCV32-NEXT:    or a4, a7, a4
+; RISCV32-NEXT:    or a2, a4, a2
+; RISCV32-NEXT:    or t4, a2, t6
+; RISCV32-NEXT:    j .LBB0_14
+; RISCV32-NEXT:  .LBB0_5: # %overflow.no.lhs
+; RISCV32-NEXT:    or t1, a5, a2
+; RISCV32-NEXT:    beqz t1, .LBB0_13
+; RISCV32-NEXT:  # %bb.6: # %overflow.no.lhs.only
+; RISCV32-NEXT:    mulhu t1, a3, a1
+; RISCV32-NEXT:    mul t6, t0, a1
+; RISCV32-NEXT:    mulhu s0, t0, a1
+; RISCV32-NEXT:    mul t4, a3, a7
+; RISCV32-NEXT:    mulhu t5, a3, a7
+; RISCV32-NEXT:    mul t2, t0, a7
+; RISCV32-NEXT:    mulhu t3, t0, a7
+; RISCV32-NEXT:    mulhu s1, a1, a4
+; RISCV32-NEXT:    mul s2, a1, a6
+; RISCV32-NEXT:    mul a7, a7, a4
+; RISCV32-NEXT:    add s1, s1, s2
+; RISCV32-NEXT:    mulhu s2, a5, a4
+; RISCV32-NEXT:    mul a6, a5, a6
+; RISCV32-NEXT:    add a6, s2, a6
+; RISCV32-NEXT:    mulhu s2, a3, a5
+; RISCV32-NEXT:    add a7, s1, a7
+; RISCV32-NEXT:    mul s1, a2, a4
+; RISCV32-NEXT:    add a6, a6, s1
+; RISCV32-NEXT:    mul s1, t0, a5
+; RISCV32-NEXT:    add t1, t6, t1
+; RISCV32-NEXT:    sltu t6, t1, t6
+; RISCV32-NEXT:    add t6, s0, t6
+; RISCV32-NEXT:    mulhu s0, t0, a5
+; RISCV32-NEXT:    add s2, s1, s2
+; RISCV32-NEXT:    sltu s1, s2, s1
+; RISCV32-NEXT:    add s0, s0, s1
+; RISCV32-NEXT:    mul s1, a3, a2
+; RISCV32-NEXT:    add t1, t4, t1
+; RISCV32-NEXT:    sltu t4, t1, t4
+; RISCV32-NEXT:    add t4, t5, t4
+; RISCV32-NEXT:    mul t5, t0, a2
+; RISCV32-NEXT:    mulhu t0, t0, a2
+; RISCV32-NEXT:    mulhu a2, a3, a2
+; RISCV32-NEXT:    add s2, s1, s2
+; RISCV32-NEXT:    sltu s1, s2, s1
+; RISCV32-NEXT:    add a2, a2, s1
+; RISCV32-NEXT:    mul s1, a1, a4
+; RISCV32-NEXT:    mul a4, a5, a4
+; RISCV32-NEXT:    mul a5, a3, a5
+; RISCV32-NEXT:    add t4, t6, t4
+; RISCV32-NEXT:    add a2, s0, a2
+; RISCV32-NEXT:    sltu t6, t4, t6
+; RISCV32-NEXT:    add t4, t2, t4
+; RISCV32-NEXT:    sltu s0, a2, s0
+; RISCV32-NEXT:    add s3, t5, a2
+; RISCV32-NEXT:    add s1, t4, s1
+; RISCV32-NEXT:    sltu t2, t4, t2
+; RISCV32-NEXT:    add t3, t3, t6
+; RISCV32-NEXT:    add a2, s3, a4
+; RISCV32-NEXT:    sltu a4, s3, t5
+; RISCV32-NEXT:    add t0, t0, s0
+; RISCV32-NEXT:    sltu t4, s1, t4
+; RISCV32-NEXT:    add t3, t3, t2
+; RISCV32-NEXT:    sltu t5, a2, s3
+; RISCV32-NEXT:    add a4, t0, a4
+; RISCV32-NEXT:    add t2, s1, a5
+; RISCV32-NEXT:    add a7, t3, a7
+; RISCV32-NEXT:    add a5, a4, a6
+; RISCV32-NEXT:    sltu a4, t2, s1
+; RISCV32-NEXT:    add a6, a7, t4
+; RISCV32-NEXT:    add t3, s2, a4
+; RISCV32-NEXT:    add t3, a6, t3
+; RISCV32-NEXT:    add a5, a5, t5
+; RISCV32-NEXT:    beq t3, a6, .LBB0_8
+; RISCV32-NEXT:  # %bb.7: # %overflow.no.lhs.only
+; RISCV32-NEXT:    sltu a4, t3, a6
+; RISCV32-NEXT:  .LBB0_8: # %overflow.no.lhs.only
+; RISCV32-NEXT:    mul a1, a3, a1
+; RISCV32-NEXT:    j .LBB0_12
+; RISCV32-NEXT:  .LBB0_9: # %overflow.no.rhs.only
+; RISCV32-NEXT:    mulhu t1, a1, a3
+; RISCV32-NEXT:    mul t6, a7, a3
+; RISCV32-NEXT:    mulhu s0, a7, a3
+; RISCV32-NEXT:    mul t4, a1, t0
+; RISCV32-NEXT:    mulhu t5, a1, t0
+; RISCV32-NEXT:    mul t2, a7, t0
+; RISCV32-NEXT:    mulhu t3, a7, t0
+; RISCV32-NEXT:    mulhu s1, a3, a5
+; RISCV32-NEXT:    mul s2, a3, a2
+; RISCV32-NEXT:    mul t0, t0, a5
+; RISCV32-NEXT:    add s1, s1, s2
+; RISCV32-NEXT:    mulhu s2, a4, a5
+; RISCV32-NEXT:    mul a2, a4, a2
+; RISCV32-NEXT:    add a2, s2, a2
+; RISCV32-NEXT:    mulhu s2, a1, a4
+; RISCV32-NEXT:    add t0, s1, t0
+; RISCV32-NEXT:    mul s1, a6, a5
+; RISCV32-NEXT:    add s1, a2, s1
+; RISCV32-NEXT:    mul a2, a7, a4
+; RISCV32-NEXT:    add t1, t6, t1
+; RISCV32-NEXT:    sltu t6, t1, t6
+; RISCV32-NEXT:    add t6, s0, t6
+; RISCV32-NEXT:    mulhu s0, a7, a4
+; RISCV32-NEXT:    add s2, a2, s2
+; RISCV32-NEXT:    sltu a2, s2, a2
+; RISCV32-NEXT:    add a2, s0, a2
+; RISCV32-NEXT:    mul s0, a1, a6
+; RISCV32-NEXT:    add t1, t4, t1
+; RISCV32-NEXT:    sltu t4, t1, t4
+; RISCV32-NEXT:    add t4, t5, t4
+; RISCV32-NEXT:    mul t5, a7, a6
+; RISCV32-NEXT:    mulhu a7, a7, a6
+; RISCV32-NEXT:    mulhu a6, a1, a6
+; RISCV32-NEXT:    add s2, s0, s2
+; RISCV32-NEXT:    sltu s0, s2, s0
+; RISCV32-NEXT:    add a6, a6, s0
+; RISCV32-NEXT:    mul s0, a3, a5
+; RISCV32-NEXT:    mul a5, a4, a5
+; RISCV32-NEXT:    mul a4, a1, a4
+; RISCV32-NEXT:    add t4, t6, t4
+; RISCV32-NEXT:    add a6, a2, a6
+; RISCV32-NEXT:    sltu t6, t4, t6
+; RISCV32-NEXT:    add t4, t2, t4
+; RISCV32-NEXT:    sltu s3, a6, a2
+; RISCV32-NEXT:    add a6, t5, a6
+; RISCV32-NEXT:    add s0, t4, s0
+; RISCV32-NEXT:    sltu t2, t4, t2
+; RISCV32-NEXT:    add t3, t3, t6
+; RISCV32-NEXT:    add a2, a6, a5
+; RISCV32-NEXT:    sltu a5, a6, t5
+; RISCV32-NEXT:    add a7, a7, s3
+; RISCV32-NEXT:    sltu t4, s0, t4
+; RISCV32-NEXT:    add t3, t3, t2
+; RISCV32-NEXT:    sltu t5, a2, a6
+; RISCV32-NEXT:    add a5, a7, a5
+; RISCV32-NEXT:    add t2, s0, a4
+; RISCV32-NEXT:    add a6, t3, t0
+; RISCV32-NEXT:    add a5, a5, s1
+; RISCV32-NEXT:    sltu a4, t2, s0
+; RISCV32-NEXT:    add a6, a6, t4
+; RISCV32-NEXT:    add t3, s2, a4
+; RISCV32-NEXT:    add t3, a6, t3
+; RISCV32-NEXT:    add a5, a5, t5
+; RISCV32-NEXT:    beq t3, a6, .LBB0_11
+; RISCV32-NEXT:  # %bb.10: # %overflow.no.rhs.only
+; RISCV32-NEXT:    sltu a4, t3, a6
+; RISCV32-NEXT:  .LBB0_11: # %overflow.no.rhs.only
+; RISCV32-NEXT:    mul a1, a1, a3
+; RISCV32-NEXT:  .LBB0_12: # %overflow.res
+; RISCV32-NEXT:    add a4, a2, a4
+; RISCV32-NEXT:    sltu a2, a4, a2
+; RISCV32-NEXT:    add a2, a5, a2
+; RISCV32-NEXT:    or a2, a4, a2
+; RISCV32-NEXT:    snez t4, a2
+; RISCV32-NEXT:    j .LBB0_15
+; RISCV32-NEXT:  .LBB0_13: # %overflow.no
+; RISCV32-NEXT:    li t4, 0
+; RISCV32-NEXT:    mulhu t1, a3, a1
+; RISCV32-NEXT:    mul t2, t0, a1
+; RISCV32-NEXT:    mulhu t3, t0, a1
+; RISCV32-NEXT:    mul t5, a3, a7
+; RISCV32-NEXT:    mulhu t6, a3, a7
+; RISCV32-NEXT:    mul s0, t0, a7
+; RISCV32-NEXT:    mul s1, a5, t0
+; RISCV32-NEXT:    mulhu s2, a5, a3
+; RISCV32-NEXT:    add s1, s2, s1
+; RISCV32-NEXT:    mul s2, a1, a4
+; RISCV32-NEXT:    mul a5, a5, a3
+; RISCV32-NEXT:    mulhu t0, t0, a7
+; RISCV32-NEXT:    mul a2, a2, a3
+; RISCV32-NEXT:    mul a7, a7, a4
+; RISCV32-NEXT:    mulhu a4, a1, a4
+; RISCV32-NEXT:    mul a6, a1, a6
+; RISCV32-NEXT:    add t1, t2, t1
+; RISCV32-NEXT:    add s2, a5, s2
+; RISCV32-NEXT:    add a4, a4, a6
+; RISCV32-NEXT:    sltu a6, t1, t2
+; RISCV32-NEXT:    add t1, t5, t1
+; RISCV32-NEXT:    add a2, s1, a2
+; RISCV32-NEXT:    add a4, a4, a7
+; RISCV32-NEXT:    sltu a5, s2, a5
+; RISCV32-NEXT:    add a6, t3, a6
+; RISCV32-NEXT:    sltu a7, t1, t5
+; RISCV32-NEXT:    add a2, a2, a4
+; RISCV32-NEXT:    add a7, t6, a7
+; RISCV32-NEXT:    add a2, a2, a5
+; RISCV32-NEXT:    add a7, a6, a7
+; RISCV32-NEXT:    add a4, s0, a7
+; RISCV32-NEXT:    sltu a5, a7, a6
+; RISCV32-NEXT:    add t2, a4, s2
+; RISCV32-NEXT:    sltu a6, a4, s0
+; RISCV32-NEXT:    add a5, t0, a5
+; RISCV32-NEXT:    sltu t3, t2, a4
+; RISCV32-NEXT:    add a5, a5, a6
+; RISCV32-NEXT:    add a2, a5, a2
+; RISCV32-NEXT:    add t3, a2, t3
+; RISCV32-NEXT:  .LBB0_14: # %overflow.res
+; RISCV32-NEXT:    mul a1, a3, a1
+; RISCV32-NEXT:  .LBB0_15: # %overflow.res
+; RISCV32-NEXT:    andi a2, t4, 1
+; RISCV32-NEXT:    sw a1, 0(a0)
 ; RISCV32-NEXT:    sw t1, 4(a0)
 ; RISCV32-NEXT:    sw t2, 8(a0)
 ; RISCV32-NEXT:    sw t3, 12(a0)
-; RISCV32-NEXT:    sb a1, 16(a0)
+; RISCV32-NEXT:    sb a2, 16(a0)
 ; RISCV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
+; RISCV32-NEXT:    lw s6, 4(sp) # 4-byte Folded Reload
+; RISCV32-NEXT:    lw s7, 0(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    addi sp, sp, 32
 ; RISCV32-NEXT:    ret
 start:
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index a30593d7d7afb..0dac74355d2e9 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -1314,38 +1314,173 @@ entry:
 
 define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-LABEL: smulo.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    srai a6, a0, 31
+; RV32-NEXT:    srai a5, a2, 31
+; RV32-NEXT:    beq a1, a6, .LBB21_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beq a3, a5, .LBB21_6
+; RV32-NEXT:  # %bb.2: # %overflow
 ; RV32-NEXT:    mulhu a5, a0, a2
 ; RV32-NEXT:    mul a6, a1, a2
 ; RV32-NEXT:    mulhsu a7, a1, a2
 ; RV32-NEXT:    mul t0, a3, a0
 ; RV32-NEXT:    mulh t1, a1, a3
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    mul t2, a1, a3
 ; RV32-NEXT:    mulhsu a3, a3, a0
-; RV32-NEXT:    mul a2, a0, a2
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    sltu a0, a5, a6
-; RV32-NEXT:    add a5, t0, a5
-; RV32-NEXT:    add a0, a7, a0
-; RV32-NEXT:    sltu a6, a5, t0
-; RV32-NEXT:    srai a7, a5, 31
+; RV32-NEXT:    add a1, a6, a5
+; RV32-NEXT:    sltu a5, a1, a6
+; RV32-NEXT:    add a1, t0, a1
+; RV32-NEXT:    add a5, a7, a5
+; RV32-NEXT:    sltu a6, a1, t0
 ; RV32-NEXT:    add a3, a3, a6
-; RV32-NEXT:    srai a6, a0, 31
-; RV32-NEXT:    add t0, a0, a3
-; RV32-NEXT:    srai a3, a3, 31
-; RV32-NEXT:    sltu a0, t0, a0
+; RV32-NEXT:    srai a6, a5, 31
+; RV32-NEXT:    srai a7, a3, 31
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    srai a7, a1, 31
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    sltu a5, a3, a5
+; RV32-NEXT:    add a3, t2, a3
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    sltu a6, a3, t2
+; RV32-NEXT:    xor a3, a3, a7
+; RV32-NEXT:    add a5, t1, a5
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    xor a5, a5, a7
+; RV32-NEXT:    or a3, a3, a5
+; RV32-NEXT:    snez a5, a3
+; RV32-NEXT:    j .LBB21_9
+; RV32-NEXT:  .LBB21_3: # %overflow.no.lhs
+; RV32-NEXT:    beq a3, a5, .LBB21_8
+; RV32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a1, .LBB21_10
+; RV32-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a5, a0
+; RV32-NEXT:    mv a6, a1
+; RV32-NEXT:    bgez a1, .LBB21_11
+; RV32-NEXT:    j .LBB21_12
+; RV32-NEXT:  .LBB21_6: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a3, .LBB21_14
+; RV32-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a5, a2
+; RV32-NEXT:    mv a6, a3
+; RV32-NEXT:    bgez a3, .LBB21_15
+; RV32-NEXT:    j .LBB21_16
+; RV32-NEXT:  .LBB21_8: # %overflow.no
+; RV32-NEXT:    li a5, 0
+; RV32-NEXT:    mulhu a6, a0, a2
+; RV32-NEXT:    mul a3, a0, a3
 ; RV32-NEXT:    add a3, a6, a3
-; RV32-NEXT:    add t0, a1, t0
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:  .LBB21_9: # %overflow.res
+; RV32-NEXT:    mul a2, a0, a2
+; RV32-NEXT:    j .LBB21_27
+; RV32-NEXT:  .LBB21_10:
+; RV32-NEXT:    neg a5, a0
+; RV32-NEXT:    snez a6, a0
+; RV32-NEXT:    neg a7, a1
+; RV32-NEXT:    sub a6, a7, a6
+; RV32-NEXT:    bltz a1, .LBB21_12
+; RV32-NEXT:  .LBB21_11: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a6, a1
+; RV32-NEXT:    mv a5, a0
+; RV32-NEXT:  .LBB21_12: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a3, .LBB21_18
+; RV32-NEXT:  # %bb.13: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a7, a2
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    j .LBB21_19
+; RV32-NEXT:  .LBB21_14:
+; RV32-NEXT:    neg a5, a2
+; RV32-NEXT:    snez a6, a2
+; RV32-NEXT:    neg a7, a3
+; RV32-NEXT:    sub a6, a7, a6
+; RV32-NEXT:    bltz a3, .LBB21_16
+; RV32-NEXT:  .LBB21_15: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a6, a3
+; RV32-NEXT:    mv a5, a2
+; RV32-NEXT:  .LBB21_16: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a1, .LBB21_22
+; RV32-NEXT:  # %bb.17: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a7, a0
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    j .LBB21_23
+; RV32-NEXT:  .LBB21_18:
+; RV32-NEXT:    neg a7, a2
+; RV32-NEXT:    snez a0, a2
+; RV32-NEXT:    neg t0, a3
+; RV32-NEXT:    sub a0, t0, a0
+; RV32-NEXT:  .LBB21_19: # %overflow.no.lhs.only
+; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    slti t0, a3, 0
+; RV32-NEXT:    bltz a3, .LBB21_21
+; RV32-NEXT:  # %bb.20: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    mv a7, a2
+; RV32-NEXT:  .LBB21_21: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu a2, a5, a7
+; RV32-NEXT:    mul a3, a6, a7
+; RV32-NEXT:    mul a7, a5, a7
+; RV32-NEXT:    mul a6, a6, a0
+; RV32-NEXT:    mulhu t1, a5, a0
+; RV32-NEXT:    mul a0, a5, a0
+; RV32-NEXT:    xor a1, t0, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a6, t1, a6
+; RV32-NEXT:    neg a3, a1
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    xor a5, a7, a3
+; RV32-NEXT:    sltu a7, a0, a2
+; RV32-NEXT:    add a2, a5, a1
+; RV32-NEXT:    xor a0, a0, a3
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    sltu a5, a2, a1
+; RV32-NEXT:    add a1, a0, a5
+; RV32-NEXT:    sltu a0, a1, a5
+; RV32-NEXT:    xor a3, a6, a3
 ; RV32-NEXT:    add a0, a3, a0
-; RV32-NEXT:    sltu a1, t0, a1
-; RV32-NEXT:    xor a3, t0, a7
-; RV32-NEXT:    add a0, t1, a0
+; RV32-NEXT:    j .LBB21_26
+; RV32-NEXT:  .LBB21_22:
+; RV32-NEXT:    neg a7, a0
+; RV32-NEXT:    snez a2, a0
+; RV32-NEXT:    neg t0, a1
+; RV32-NEXT:    sub a2, t0, a2
+; RV32-NEXT:  .LBB21_23: # %overflow.no.rhs.only
+; RV32-NEXT:    slti a3, a3, 0
+; RV32-NEXT:    slti t0, a1, 0
+; RV32-NEXT:    bltz a1, .LBB21_25
+; RV32-NEXT:  # %bb.24: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a7, a0
+; RV32-NEXT:  .LBB21_25: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu a0, a5, a7
+; RV32-NEXT:    mul a1, a6, a7
+; RV32-NEXT:    mul a7, a5, a7
+; RV32-NEXT:    mul a6, a6, a2
+; RV32-NEXT:    mulhu t1, a5, a2
+; RV32-NEXT:    mul a2, a5, a2
+; RV32-NEXT:    xor a3, a3, t0
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    xor a0, a0, a7
-; RV32-NEXT:    or a0, a3, a0
-; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    add a6, t1, a6
+; RV32-NEXT:    neg a5, a3
+; RV32-NEXT:    add a1, a0, a2
+; RV32-NEXT:    xor a2, a7, a5
+; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    xor a1, a1, a5
+; RV32-NEXT:    add a0, a6, a0
+; RV32-NEXT:    sltu a3, a2, a3
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    sltu a3, a1, a3
+; RV32-NEXT:    xor a0, a0, a5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:  .LBB21_26: # %overflow.res
+; RV32-NEXT:    snez a5, a0
+; RV32-NEXT:  .LBB21_27: # %overflow.res
+; RV32-NEXT:    andi a0, a5, 1
 ; RV32-NEXT:    sw a2, 0(a4)
-; RV32-NEXT:    sw a5, 4(a4)
+; RV32-NEXT:    sw a1, 4(a4)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo.i64:
@@ -1359,38 +1494,173 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: smulo.i64:
-; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    srai a6, a0, 31
+; RV32ZBA-NEXT:    srai a5, a2, 31
+; RV32ZBA-NEXT:    beq a1, a6, .LBB21_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beq a3, a5, .LBB21_6
+; RV32ZBA-NEXT:  # %bb.2: # %overflow
 ; RV32ZBA-NEXT:    mulhu a5, a0, a2
 ; RV32ZBA-NEXT:    mul a6, a1, a2
 ; RV32ZBA-NEXT:    mulhsu a7, a1, a2
 ; RV32ZBA-NEXT:    mul t0, a3, a0
 ; RV32ZBA-NEXT:    mulh t1, a1, a3
-; RV32ZBA-NEXT:    mul a1, a1, a3
+; RV32ZBA-NEXT:    mul t2, a1, a3
 ; RV32ZBA-NEXT:    mulhsu a3, a3, a0
-; RV32ZBA-NEXT:    mul a2, a0, a2
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    sltu a0, a5, a6
-; RV32ZBA-NEXT:    add a5, t0, a5
-; RV32ZBA-NEXT:    add a0, a7, a0
-; RV32ZBA-NEXT:    sltu a6, a5, t0
-; RV32ZBA-NEXT:    srai a7, a5, 31
+; RV32ZBA-NEXT:    add a1, a6, a5
+; RV32ZBA-NEXT:    sltu a5, a1, a6
+; RV32ZBA-NEXT:    add a1, t0, a1
+; RV32ZBA-NEXT:    add a5, a7, a5
+; RV32ZBA-NEXT:    sltu a6, a1, t0
 ; RV32ZBA-NEXT:    add a3, a3, a6
-; RV32ZBA-NEXT:    srai a6, a0, 31
-; RV32ZBA-NEXT:    add t0, a0, a3
-; RV32ZBA-NEXT:    srai a3, a3, 31
-; RV32ZBA-NEXT:    sltu a0, t0, a0
+; RV32ZBA-NEXT:    srai a6, a5, 31
+; RV32ZBA-NEXT:    srai a7, a3, 31
+; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    srai a7, a1, 31
+; RV32ZBA-NEXT:    add a3, a5, a3
+; RV32ZBA-NEXT:    sltu a5, a3, a5
+; RV32ZBA-NEXT:    add a3, t2, a3
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    sltu a6, a3, t2
+; RV32ZBA-NEXT:    xor a3, a3, a7
+; RV32ZBA-NEXT:    add a5, t1, a5
+; RV32ZBA-NEXT:    add a5, a5, a6
+; RV32ZBA-NEXT:    xor a5, a5, a7
+; RV32ZBA-NEXT:    or a3, a3, a5
+; RV32ZBA-NEXT:    snez a5, a3
+; RV32ZBA-NEXT:    j .LBB21_9
+; RV32ZBA-NEXT:  .LBB21_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beq a3, a5, .LBB21_8
+; RV32ZBA-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB21_10
+; RV32ZBA-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a5, a0
+; RV32ZBA-NEXT:    mv a6, a1
+; RV32ZBA-NEXT:    bgez a1, .LBB21_11
+; RV32ZBA-NEXT:    j .LBB21_12
+; RV32ZBA-NEXT:  .LBB21_6: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB21_14
+; RV32ZBA-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a5, a2
+; RV32ZBA-NEXT:    mv a6, a3
+; RV32ZBA-NEXT:    bgez a3, .LBB21_15
+; RV32ZBA-NEXT:    j .LBB21_16
+; RV32ZBA-NEXT:  .LBB21_8: # %overflow.no
+; RV32ZBA-NEXT:    li a5, 0
+; RV32ZBA-NEXT:    mulhu a6, a0, a2
+; RV32ZBA-NEXT:    mul a3, a0, a3
 ; RV32ZBA-NEXT:    add a3, a6, a3
-; RV32ZBA-NEXT:    add t0, a1, t0
+; RV32ZBA-NEXT:    mul a1, a1, a2
+; RV32ZBA-NEXT:    add a1, a3, a1
+; RV32ZBA-NEXT:  .LBB21_9: # %overflow.res
+; RV32ZBA-NEXT:    mul a2, a0, a2
+; RV32ZBA-NEXT:    j .LBB21_27
+; RV32ZBA-NEXT:  .LBB21_10:
+; RV32ZBA-NEXT:    neg a5, a0
+; RV32ZBA-NEXT:    snez a6, a0
+; RV32ZBA-NEXT:    neg a7, a1
+; RV32ZBA-NEXT:    sub a6, a7, a6
+; RV32ZBA-NEXT:    bltz a1, .LBB21_12
+; RV32ZBA-NEXT:  .LBB21_11: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a6, a1
+; RV32ZBA-NEXT:    mv a5, a0
+; RV32ZBA-NEXT:  .LBB21_12: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB21_18
+; RV32ZBA-NEXT:  # %bb.13: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a7, a2
+; RV32ZBA-NEXT:    mv a0, a3
+; RV32ZBA-NEXT:    j .LBB21_19
+; RV32ZBA-NEXT:  .LBB21_14:
+; RV32ZBA-NEXT:    neg a5, a2
+; RV32ZBA-NEXT:    snez a6, a2
+; RV32ZBA-NEXT:    neg a7, a3
+; RV32ZBA-NEXT:    sub a6, a7, a6
+; RV32ZBA-NEXT:    bltz a3, .LBB21_16
+; RV32ZBA-NEXT:  .LBB21_15: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a6, a3
+; RV32ZBA-NEXT:    mv a5, a2
+; RV32ZBA-NEXT:  .LBB21_16: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB21_22
+; RV32ZBA-NEXT:  # %bb.17: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a7, a0
+; RV32ZBA-NEXT:    mv a2, a1
+; RV32ZBA-NEXT:    j .LBB21_23
+; RV32ZBA-NEXT:  .LBB21_18:
+; RV32ZBA-NEXT:    neg a7, a2
+; RV32ZBA-NEXT:    snez a0, a2
+; RV32ZBA-NEXT:    neg t0, a3
+; RV32ZBA-NEXT:    sub a0, t0, a0
+; RV32ZBA-NEXT:  .LBB21_19: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    slti a1, a1, 0
+; RV32ZBA-NEXT:    slti t0, a3, 0
+; RV32ZBA-NEXT:    bltz a3, .LBB21_21
+; RV32ZBA-NEXT:  # %bb.20: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a0, a3
+; RV32ZBA-NEXT:    mv a7, a2
+; RV32ZBA-NEXT:  .LBB21_21: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu a2, a5, a7
+; RV32ZBA-NEXT:    mul a3, a6, a7
+; RV32ZBA-NEXT:    mul a7, a5, a7
+; RV32ZBA-NEXT:    mul a6, a6, a0
+; RV32ZBA-NEXT:    mulhu t1, a5, a0
+; RV32ZBA-NEXT:    mul a0, a5, a0
+; RV32ZBA-NEXT:    xor a1, t0, a1
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    add a6, t1, a6
+; RV32ZBA-NEXT:    neg a3, a1
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    xor a5, a7, a3
+; RV32ZBA-NEXT:    sltu a7, a0, a2
+; RV32ZBA-NEXT:    add a2, a5, a1
+; RV32ZBA-NEXT:    xor a0, a0, a3
+; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    sltu a5, a2, a1
+; RV32ZBA-NEXT:    add a1, a0, a5
+; RV32ZBA-NEXT:    sltu a0, a1, a5
+; RV32ZBA-NEXT:    xor a3, a6, a3
 ; RV32ZBA-NEXT:    add a0, a3, a0
-; RV32ZBA-NEXT:    sltu a1, t0, a1
-; RV32ZBA-NEXT:    xor a3, t0, a7
-; RV32ZBA-NEXT:    add a0, t1, a0
+; RV32ZBA-NEXT:    j .LBB21_26
+; RV32ZBA-NEXT:  .LBB21_22:
+; RV32ZBA-NEXT:    neg a7, a0
+; RV32ZBA-NEXT:    snez a2, a0
+; RV32ZBA-NEXT:    neg t0, a1
+; RV32ZBA-NEXT:    sub a2, t0, a2
+; RV32ZBA-NEXT:  .LBB21_23: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    slti a3, a3, 0
+; RV32ZBA-NEXT:    slti t0, a1, 0
+; RV32ZBA-NEXT:    bltz a1, .LBB21_25
+; RV32ZBA-NEXT:  # %bb.24: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a2, a1
+; RV32ZBA-NEXT:    mv a7, a0
+; RV32ZBA-NEXT:  .LBB21_25: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu a0, a5, a7
+; RV32ZBA-NEXT:    mul a1, a6, a7
+; RV32ZBA-NEXT:    mul a7, a5, a7
+; RV32ZBA-NEXT:    mul a6, a6, a2
+; RV32ZBA-NEXT:    mulhu t1, a5, a2
+; RV32ZBA-NEXT:    mul a2, a5, a2
+; RV32ZBA-NEXT:    xor a3, a3, t0
 ; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    xor a0, a0, a7
-; RV32ZBA-NEXT:    or a0, a3, a0
-; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    add a6, t1, a6
+; RV32ZBA-NEXT:    neg a5, a3
+; RV32ZBA-NEXT:    add a1, a0, a2
+; RV32ZBA-NEXT:    xor a2, a7, a5
+; RV32ZBA-NEXT:    sltu a0, a1, a0
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    xor a1, a1, a5
+; RV32ZBA-NEXT:    add a0, a6, a0
+; RV32ZBA-NEXT:    sltu a3, a2, a3
+; RV32ZBA-NEXT:    add a1, a1, a3
+; RV32ZBA-NEXT:    sltu a3, a1, a3
+; RV32ZBA-NEXT:    xor a0, a0, a5
+; RV32ZBA-NEXT:    add a0, a0, a3
+; RV32ZBA-NEXT:  .LBB21_26: # %overflow.res
+; RV32ZBA-NEXT:    snez a5, a0
+; RV32ZBA-NEXT:  .LBB21_27: # %overflow.res
+; RV32ZBA-NEXT:    andi a0, a5, 1
 ; RV32ZBA-NEXT:    sw a2, 0(a4)
-; RV32ZBA-NEXT:    sw a5, 4(a4)
+; RV32ZBA-NEXT:    sw a1, 4(a4)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo.i64:
@@ -1404,38 +1674,165 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: smulo.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    srai a6, a0, 31
+; RV32ZICOND-NEXT:    srai a5, a2, 31
+; RV32ZICOND-NEXT:    beq a1, a6, .LBB21_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beq a3, a5, .LBB21_5
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow
 ; RV32ZICOND-NEXT:    mulhu a5, a0, a2
 ; RV32ZICOND-NEXT:    mul a6, a1, a2
 ; RV32ZICOND-NEXT:    mulhsu a7, a1, a2
 ; RV32ZICOND-NEXT:    mul t0, a3, a0
 ; RV32ZICOND-NEXT:    mulh t1, a1, a3
-; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    mul t2, a1, a3
 ; RV32ZICOND-NEXT:    mulhsu a3, a3, a0
-; RV32ZICOND-NEXT:    mul a2, a0, a2
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    sltu a0, a5, a6
-; RV32ZICOND-NEXT:    add a5, t0, a5
-; RV32ZICOND-NEXT:    add a0, a7, a0
-; RV32ZICOND-NEXT:    sltu a6, a5, t0
-; RV32ZICOND-NEXT:    srai a7, a5, 31
+; RV32ZICOND-NEXT:    add a1, a6, a5
+; RV32ZICOND-NEXT:    sltu a5, a1, a6
+; RV32ZICOND-NEXT:    add a1, t0, a1
+; RV32ZICOND-NEXT:    add a5, a7, a5
+; RV32ZICOND-NEXT:    sltu a6, a1, t0
 ; RV32ZICOND-NEXT:    add a3, a3, a6
-; RV32ZICOND-NEXT:    srai a6, a0, 31
-; RV32ZICOND-NEXT:    add t0, a0, a3
-; RV32ZICOND-NEXT:    srai a3, a3, 31
-; RV32ZICOND-NEXT:    sltu a0, t0, a0
-; RV32ZICOND-NEXT:    add a3, a6, a3
-; RV32ZICOND-NEXT:    add t0, a1, t0
+; RV32ZICOND-NEXT:    srai a6, a5, 31
+; RV32ZICOND-NEXT:    srai a7, a3, 31
+; RV32ZICOND-NEXT:    add a6, a6, a7
+; RV32ZICOND-NEXT:    srai a7, a1, 31
+; RV32ZICOND-NEXT:    add a3, a5, a3
+; RV32ZICOND-NEXT:    sltu a5, a3, a5
+; RV32ZICOND-NEXT:    add a3, t2, a3
+; RV32ZICOND-NEXT:    add a5, a6, a5
+; RV32ZICOND-NEXT:    sltu a6, a3, t2
+; RV32ZICOND-NEXT:    xor a3, a3, a7
+; RV32ZICOND-NEXT:    add a5, t1, a5
+; RV32ZICOND-NEXT:    add a5, a5, a6
+; RV32ZICOND-NEXT:    xor a5, a5, a7
+; RV32ZICOND-NEXT:    or a3, a3, a5
+; RV32ZICOND-NEXT:    snez a5, a3
+; RV32ZICOND-NEXT:    j .LBB21_7
+; RV32ZICOND-NEXT:  .LBB21_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beq a3, a5, .LBB21_6
+; RV32ZICOND-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    slti a5, a1, 0
+; RV32ZICOND-NEXT:    neg a6, a0
+; RV32ZICOND-NEXT:    snez a7, a0
+; RV32ZICOND-NEXT:    neg t0, a1
+; RV32ZICOND-NEXT:    snez t1, a2
+; RV32ZICOND-NEXT:    sub a7, t0, a7
+; RV32ZICOND-NEXT:    neg t0, a3
+; RV32ZICOND-NEXT:    sub t0, t0, t1
+; RV32ZICOND-NEXT:    slti t1, a3, 0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a5
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a5
+; RV32ZICOND-NEXT:    or a6, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a5
+; RV32ZICOND-NEXT:    or a0, a6, a0
+; RV32ZICOND-NEXT:    neg a6, a2
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a5
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, t1
+; RV32ZICOND-NEXT:    czero.nez a2, a2, t1
+; RV32ZICOND-NEXT:    czero.nez a3, a3, t1
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, a5
+; RV32ZICOND-NEXT:    or a7, a7, a1
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, a5
+; RV32ZICOND-NEXT:    xor a5, t1, a5
+; RV32ZICOND-NEXT:    or a6, a6, a2
+; RV32ZICOND-NEXT:    czero.eqz t0, t0, t1
+; RV32ZICOND-NEXT:    or t0, t0, a3
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, t1
+; RV32ZICOND-NEXT:    czero.eqz t0, t0, t1
+; RV32ZICOND-NEXT:    neg t1, a5
+; RV32ZICOND-NEXT:    or a2, a6, a2
+; RV32ZICOND-NEXT:    or a1, a7, a1
+; RV32ZICOND-NEXT:    or a3, t0, a3
+; RV32ZICOND-NEXT:    mulhu a6, a0, a2
+; RV32ZICOND-NEXT:    mul a7, a0, a2
+; RV32ZICOND-NEXT:    mul a2, a1, a2
+; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    mulhu t0, a0, a3
+; RV32ZICOND-NEXT:    mul a0, a0, a3
+; RV32ZICOND-NEXT:    xor a3, a7, t1
+; RV32ZICOND-NEXT:    add a6, a6, a2
+; RV32ZICOND-NEXT:    add a1, t0, a1
+; RV32ZICOND-NEXT:    add a2, a3, a5
+; RV32ZICOND-NEXT:    add a0, a6, a0
+; RV32ZICOND-NEXT:    sltu a3, a2, a5
+; RV32ZICOND-NEXT:    sltu a5, a0, a6
+; RV32ZICOND-NEXT:    xor a0, a0, t1
+; RV32ZICOND-NEXT:    add a5, a1, a5
+; RV32ZICOND-NEXT:    add a1, a0, a3
+; RV32ZICOND-NEXT:    sltu a0, a1, a3
+; RV32ZICOND-NEXT:    xor a3, a5, t1
 ; RV32ZICOND-NEXT:    add a0, a3, a0
-; RV32ZICOND-NEXT:    sltu a1, t0, a1
-; RV32ZICOND-NEXT:    xor a3, t0, a7
-; RV32ZICOND-NEXT:    add a0, t1, a0
-; RV32ZICOND-NEXT:    add a0, a0, a1
-; RV32ZICOND-NEXT:    xor a0, a0, a7
-; RV32ZICOND-NEXT:    or a0, a3, a0
-; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    snez a5, a0
+; RV32ZICOND-NEXT:    j .LBB21_8
+; RV32ZICOND-NEXT:  .LBB21_5: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    slti a5, a3, 0
+; RV32ZICOND-NEXT:    neg a6, a2
+; RV32ZICOND-NEXT:    snez a7, a2
+; RV32ZICOND-NEXT:    neg t0, a3
+; RV32ZICOND-NEXT:    snez t1, a0
+; RV32ZICOND-NEXT:    sub a7, t0, a7
+; RV32ZICOND-NEXT:    neg t0, a1
+; RV32ZICOND-NEXT:    sub t0, t0, t1
+; RV32ZICOND-NEXT:    slti t1, a1, 0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a5
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a5
+; RV32ZICOND-NEXT:    or a6, a6, a2
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a5
+; RV32ZICOND-NEXT:    or a2, a6, a2
+; RV32ZICOND-NEXT:    neg a6, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a5
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, t1
+; RV32ZICOND-NEXT:    czero.nez a0, a0, t1
+; RV32ZICOND-NEXT:    czero.nez a1, a1, t1
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, a5
+; RV32ZICOND-NEXT:    or a7, a7, a3
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, a5
+; RV32ZICOND-NEXT:    xor a5, a5, t1
+; RV32ZICOND-NEXT:    or a6, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz t0, t0, t1
+; RV32ZICOND-NEXT:    or t0, t0, a1
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, t1
+; RV32ZICOND-NEXT:    czero.eqz t0, t0, t1
+; RV32ZICOND-NEXT:    neg t1, a5
+; RV32ZICOND-NEXT:    or a0, a6, a0
+; RV32ZICOND-NEXT:    or a3, a7, a3
+; RV32ZICOND-NEXT:    or a1, t0, a1
+; RV32ZICOND-NEXT:    mulhu a6, a2, a0
+; RV32ZICOND-NEXT:    mul a7, a2, a0
+; RV32ZICOND-NEXT:    mul a0, a3, a0
+; RV32ZICOND-NEXT:    mul a3, a3, a1
+; RV32ZICOND-NEXT:    mulhu t0, a2, a1
+; RV32ZICOND-NEXT:    mul a1, a2, a1
+; RV32ZICOND-NEXT:    xor a2, a7, t1
+; RV32ZICOND-NEXT:    add a0, a6, a0
+; RV32ZICOND-NEXT:    add a3, t0, a3
+; RV32ZICOND-NEXT:    add a2, a2, a5
+; RV32ZICOND-NEXT:    add a1, a0, a1
+; RV32ZICOND-NEXT:    sltu a5, a2, a5
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    xor a1, a1, t1
+; RV32ZICOND-NEXT:    add a0, a3, a0
+; RV32ZICOND-NEXT:    add a1, a1, a5
+; RV32ZICOND-NEXT:    sltu a3, a1, a5
+; RV32ZICOND-NEXT:    xor a0, a0, t1
+; RV32ZICOND-NEXT:    add a0, a0, a3
+; RV32ZICOND-NEXT:    snez a5, a0
+; RV32ZICOND-NEXT:    j .LBB21_8
+; RV32ZICOND-NEXT:  .LBB21_6: # %overflow.no
+; RV32ZICOND-NEXT:    li a5, 0
+; RV32ZICOND-NEXT:    mulhu a6, a0, a2
+; RV32ZICOND-NEXT:    mul a3, a0, a3
+; RV32ZICOND-NEXT:    add a3, a6, a3
+; RV32ZICOND-NEXT:    mul a1, a1, a2
+; RV32ZICOND-NEXT:    add a1, a3, a1
+; RV32ZICOND-NEXT:  .LBB21_7: # %overflow.res
+; RV32ZICOND-NEXT:    mul a2, a0, a2
+; RV32ZICOND-NEXT:  .LBB21_8: # %overflow.res
+; RV32ZICOND-NEXT:    andi a0, a5, 1
 ; RV32ZICOND-NEXT:    sw a2, 0(a4)
-; RV32ZICOND-NEXT:    sw a5, 4(a4)
+; RV32ZICOND-NEXT:    sw a1, 4(a4)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo.i64:
@@ -1457,23 +1854,57 @@ entry:
 
 define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV32-LABEL: smulo2.i64:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    li a3, 13
-; RV32-NEXT:    mulhu a4, a0, a3
-; RV32-NEXT:    mul a5, a1, a3
-; RV32-NEXT:    mulh a1, a1, a3
-; RV32-NEXT:    mul a3, a0, a3
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a0, a4, a5
-; RV32-NEXT:    srai a5, a4, 31
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    xor a1, a0, a5
-; RV32-NEXT:    srai a0, a0, 31
-; RV32-NEXT:    xor a0, a0, a5
-; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    snez a0, a0
-; RV32-NEXT:    sw a3, 0(a2)
-; RV32-NEXT:    sw a4, 4(a2)
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    srai a3, a0, 31
+; RV32-NEXT:    beq a1, a3, .LBB22_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    bltz a1, .LBB22_4
+; RV32-NEXT:  # %bb.2: # %overflow.lhs
+; RV32-NEXT:    mv a3, a0
+; RV32-NEXT:    mv a4, a1
+; RV32-NEXT:    bgez a1, .LBB22_5
+; RV32-NEXT:    j .LBB22_6
+; RV32-NEXT:  .LBB22_3: # %overflow.no.lhs
+; RV32-NEXT:    li a4, 0
+; RV32-NEXT:    li a5, 13
+; RV32-NEXT:    mulhu a3, a0, a5
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a3, a3, a1
+; RV32-NEXT:    mul a1, a0, a5
+; RV32-NEXT:    j .LBB22_7
+; RV32-NEXT:  .LBB22_4:
+; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    snez a4, a0
+; RV32-NEXT:    neg a5, a1
+; RV32-NEXT:    sub a4, a5, a4
+; RV32-NEXT:    bltz a1, .LBB22_6
+; RV32-NEXT:  .LBB22_5: # %overflow.lhs
+; RV32-NEXT:    mv a4, a1
+; RV32-NEXT:    mv a3, a0
+; RV32-NEXT:  .LBB22_6: # %overflow.lhs
+; RV32-NEXT:    li a0, 13
+; RV32-NEXT:    mul a5, a3, a0
+; RV32-NEXT:    mulhu a3, a3, a0
+; RV32-NEXT:    mulhu a6, a4, a0
+; RV32-NEXT:    mul a0, a4, a0
+; RV32-NEXT:    srai a4, a1, 31
+; RV32-NEXT:    srli a7, a1, 31
+; RV32-NEXT:    add a0, a3, a0
+; RV32-NEXT:    xor a1, a5, a4
+; RV32-NEXT:    sltu a3, a0, a3
+; RV32-NEXT:    add a1, a1, a7
+; RV32-NEXT:    xor a0, a0, a4
+; RV32-NEXT:    add a6, a6, a3
+; RV32-NEXT:    sltu a5, a1, a7
+; RV32-NEXT:    add a3, a0, a5
+; RV32-NEXT:    sltu a0, a3, a5
+; RV32-NEXT:    xor a4, a6, a4
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    snez a4, a0
+; RV32-NEXT:  .LBB22_7: # %overflow.res
+; RV32-NEXT:    andi a0, a4, 1
+; RV32-NEXT:    sw a1, 0(a2)
+; RV32-NEXT:    sw a3, 4(a2)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo2.i64:
@@ -1488,25 +1919,61 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: smulo2.i64:
-; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    li a3, 13
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    srai a3, a0, 31
+; RV32ZBA-NEXT:    beq a1, a3, .LBB22_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    bltz a1, .LBB22_4
+; RV32ZBA-NEXT:  # %bb.2: # %overflow.lhs
+; RV32ZBA-NEXT:    mv a3, a0
+; RV32ZBA-NEXT:    mv a4, a1
+; RV32ZBA-NEXT:    bgez a1, .LBB22_5
+; RV32ZBA-NEXT:    j .LBB22_6
+; RV32ZBA-NEXT:  .LBB22_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    li a3, 0
 ; RV32ZBA-NEXT:    sh1add a4, a1, a1
-; RV32ZBA-NEXT:    sh1add a5, a0, a0
 ; RV32ZBA-NEXT:    sh2add a4, a4, a1
-; RV32ZBA-NEXT:    mulh a1, a1, a3
-; RV32ZBA-NEXT:    mulhu a3, a0, a3
-; RV32ZBA-NEXT:    sh2add a5, a5, a0
+; RV32ZBA-NEXT:    li a1, 13
+; RV32ZBA-NEXT:    mulhu a1, a0, a1
+; RV32ZBA-NEXT:    add a4, a1, a4
+; RV32ZBA-NEXT:    sh1add a1, a0, a0
+; RV32ZBA-NEXT:    sh2add a1, a1, a0
+; RV32ZBA-NEXT:    j .LBB22_7
+; RV32ZBA-NEXT:  .LBB22_4:
+; RV32ZBA-NEXT:    neg a3, a0
+; RV32ZBA-NEXT:    snez a4, a0
+; RV32ZBA-NEXT:    neg a5, a1
+; RV32ZBA-NEXT:    sub a4, a5, a4
+; RV32ZBA-NEXT:    bltz a1, .LBB22_6
+; RV32ZBA-NEXT:  .LBB22_5: # %overflow.lhs
+; RV32ZBA-NEXT:    mv a4, a1
+; RV32ZBA-NEXT:    mv a3, a0
+; RV32ZBA-NEXT:  .LBB22_6: # %overflow.lhs
+; RV32ZBA-NEXT:    sh1add a0, a3, a3
+; RV32ZBA-NEXT:    li a5, 13
+; RV32ZBA-NEXT:    sh1add a6, a4, a4
+; RV32ZBA-NEXT:    sh2add a0, a0, a3
+; RV32ZBA-NEXT:    mulhu a3, a3, a5
+; RV32ZBA-NEXT:    sh2add a6, a6, a4
+; RV32ZBA-NEXT:    mulhu a4, a4, a5
+; RV32ZBA-NEXT:    srai a5, a1, 31
+; RV32ZBA-NEXT:    srli a7, a1, 31
+; RV32ZBA-NEXT:    add a6, a3, a6
+; RV32ZBA-NEXT:    xor a0, a0, a5
+; RV32ZBA-NEXT:    sltu a3, a6, a3
+; RV32ZBA-NEXT:    add a1, a0, a7
+; RV32ZBA-NEXT:    xor a0, a6, a5
 ; RV32ZBA-NEXT:    add a3, a4, a3
-; RV32ZBA-NEXT:    sltu a0, a3, a4
-; RV32ZBA-NEXT:    srai a4, a3, 31
-; RV32ZBA-NEXT:    add a0, a1, a0
-; RV32ZBA-NEXT:    xor a1, a0, a4
-; RV32ZBA-NEXT:    srai a0, a0, 31
-; RV32ZBA-NEXT:    xor a0, a0, a4
-; RV32ZBA-NEXT:    or a0, a1, a0
-; RV32ZBA-NEXT:    snez a0, a0
-; RV32ZBA-NEXT:    sw a5, 0(a2)
-; RV32ZBA-NEXT:    sw a3, 4(a2)
+; RV32ZBA-NEXT:    sltu a6, a1, a7
+; RV32ZBA-NEXT:    add a4, a0, a6
+; RV32ZBA-NEXT:    sltu a0, a4, a6
+; RV32ZBA-NEXT:    xor a3, a3, a5
+; RV32ZBA-NEXT:    add a0, a3, a0
+; RV32ZBA-NEXT:    snez a3, a0
+; RV32ZBA-NEXT:  .LBB22_7: # %overflow.res
+; RV32ZBA-NEXT:    andi a0, a3, 1
+; RV32ZBA-NEXT:    sw a1, 0(a2)
+; RV32ZBA-NEXT:    sw a4, 4(a2)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo2.i64:
@@ -1522,23 +1989,56 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: smulo2.i64:
-; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    li a3, 13
-; RV32ZICOND-NEXT:    mulhu a4, a0, a3
-; RV32ZICOND-NEXT:    mul a5, a1, a3
-; RV32ZICOND-NEXT:    mulh a1, a1, a3
-; RV32ZICOND-NEXT:    mul a3, a0, a3
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a0, a4, a5
-; RV32ZICOND-NEXT:    srai a5, a4, 31
-; RV32ZICOND-NEXT:    add a0, a1, a0
-; RV32ZICOND-NEXT:    xor a1, a0, a5
-; RV32ZICOND-NEXT:    srai a0, a0, 31
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    srai a3, a0, 31
+; RV32ZICOND-NEXT:    beq a1, a3, .LBB22_2
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    slti a3, a1, 0
+; RV32ZICOND-NEXT:    neg a4, a0
+; RV32ZICOND-NEXT:    snez a5, a0
+; RV32ZICOND-NEXT:    neg a6, a1
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a3
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a3
+; RV32ZICOND-NEXT:    sub a5, a6, a5
+; RV32ZICOND-NEXT:    czero.nez a6, a1, a3
+; RV32ZICOND-NEXT:    or a4, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a3
+; RV32ZICOND-NEXT:    or a5, a5, a6
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a3
+; RV32ZICOND-NEXT:    czero.eqz a3, a5, a3
+; RV32ZICOND-NEXT:    li a5, 13
+; RV32ZICOND-NEXT:    or a0, a4, a0
+; RV32ZICOND-NEXT:    or a3, a3, a6
+; RV32ZICOND-NEXT:    mul a4, a0, a5
+; RV32ZICOND-NEXT:    mulhu a0, a0, a5
+; RV32ZICOND-NEXT:    mulhu a6, a3, a5
+; RV32ZICOND-NEXT:    mul a3, a3, a5
+; RV32ZICOND-NEXT:    srai a5, a1, 31
+; RV32ZICOND-NEXT:    srli a7, a1, 31
+; RV32ZICOND-NEXT:    xor a1, a4, a5
+; RV32ZICOND-NEXT:    add a3, a0, a3
+; RV32ZICOND-NEXT:    add a1, a1, a7
+; RV32ZICOND-NEXT:    sltu a0, a3, a0
+; RV32ZICOND-NEXT:    sltu a4, a1, a7
+; RV32ZICOND-NEXT:    xor a3, a3, a5
+; RV32ZICOND-NEXT:    add a0, a6, a0
+; RV32ZICOND-NEXT:    add a3, a3, a4
+; RV32ZICOND-NEXT:    sltu a4, a3, a4
 ; RV32ZICOND-NEXT:    xor a0, a0, a5
-; RV32ZICOND-NEXT:    or a0, a1, a0
-; RV32ZICOND-NEXT:    snez a0, a0
-; RV32ZICOND-NEXT:    sw a3, 0(a2)
-; RV32ZICOND-NEXT:    sw a4, 4(a2)
+; RV32ZICOND-NEXT:    add a0, a0, a4
+; RV32ZICOND-NEXT:    snez a4, a0
+; RV32ZICOND-NEXT:    j .LBB22_3
+; RV32ZICOND-NEXT:  .LBB22_2: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    li a4, 0
+; RV32ZICOND-NEXT:    li a5, 13
+; RV32ZICOND-NEXT:    mulhu a3, a0, a5
+; RV32ZICOND-NEXT:    mul a1, a1, a5
+; RV32ZICOND-NEXT:    add a3, a3, a1
+; RV32ZICOND-NEXT:    mul a1, a0, a5
+; RV32ZICOND-NEXT:  .LBB22_3: # %overflow.res
+; RV32ZICOND-NEXT:    andi a0, a4, 1
+; RV32ZICOND-NEXT:    sw a1, 0(a2)
+; RV32ZICOND-NEXT:    sw a3, 4(a2)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo2.i64:
@@ -1766,26 +2266,71 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
 
 define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-LABEL: umulo.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    beqz a1, .LBB26_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beqz a3, .LBB26_5
+; RV32-NEXT:  # %bb.2: # %overflow
 ; RV32-NEXT:    mul a5, a3, a0
 ; RV32-NEXT:    mul a6, a1, a2
 ; RV32-NEXT:    mulhu a7, a0, a2
 ; RV32-NEXT:    snez t0, a3
-; RV32-NEXT:    mulhu a3, a3, a0
-; RV32-NEXT:    mul t1, a0, a2
-; RV32-NEXT:    mulhu a0, a1, a2
-; RV32-NEXT:    snez a1, a1
 ; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    and a1, a1, t0
-; RV32-NEXT:    snez a0, a0
-; RV32-NEXT:    snez a2, a3
-; RV32-NEXT:    add a5, a7, a5
-; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    sltu a1, a5, a7
-; RV32-NEXT:    or a0, a0, a2
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    sw t1, 0(a4)
-; RV32-NEXT:    sw a5, 4(a4)
+; RV32-NEXT:    snez a6, a1
+; RV32-NEXT:    mulhu a1, a1, a2
+; RV32-NEXT:    mulhu a3, a3, a0
+; RV32-NEXT:    and a6, a6, t0
+; RV32-NEXT:    snez t0, a1
+; RV32-NEXT:    snez a3, a3
+; RV32-NEXT:    add a1, a7, a5
+; RV32-NEXT:    or a5, a6, t0
+; RV32-NEXT:    sltu a6, a1, a7
+; RV32-NEXT:    or a3, a5, a3
+; RV32-NEXT:    or a6, a3, a6
+; RV32-NEXT:    j .LBB26_7
+; RV32-NEXT:  .LBB26_3: # %overflow.no.lhs
+; RV32-NEXT:    beqz a3, .LBB26_6
+; RV32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu a6, a0, a2
+; RV32-NEXT:    mul a7, a1, a2
+; RV32-NEXT:    mul a5, a0, a2
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    mulhu a2, a0, a3
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a2, a2, a1
+; RV32-NEXT:    mul a1, a0, a3
+; RV32-NEXT:    add a1, a6, a1
+; RV32-NEXT:    sltu a0, a1, a6
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    snez a6, a0
+; RV32-NEXT:    j .LBB26_8
+; RV32-NEXT:  .LBB26_5: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu a6, a2, a0
+; RV32-NEXT:    mul a7, a3, a0
+; RV32-NEXT:    mul a5, a2, a0
+; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    mulhu a0, a2, a1
+; RV32-NEXT:    mul a3, a3, a1
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    mul a1, a2, a1
+; RV32-NEXT:    add a1, a6, a1
+; RV32-NEXT:    sltu a2, a1, a6
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    snez a6, a0
+; RV32-NEXT:    j .LBB26_8
+; RV32-NEXT:  .LBB26_6: # %overflow.no
+; RV32-NEXT:    li a6, 0
+; RV32-NEXT:    mulhu a5, a0, a2
+; RV32-NEXT:    mul a3, a0, a3
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:  .LBB26_7: # %overflow.res
+; RV32-NEXT:    mul a5, a0, a2
+; RV32-NEXT:  .LBB26_8: # %overflow.res
+; RV32-NEXT:    andi a0, a6, 1
+; RV32-NEXT:    sw a5, 0(a4)
+; RV32-NEXT:    sw a1, 4(a4)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo.i64:
@@ -1798,26 +2343,71 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo.i64:
-; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    beqz a1, .LBB26_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB26_5
+; RV32ZBA-NEXT:  # %bb.2: # %overflow
 ; RV32ZBA-NEXT:    mul a5, a3, a0
 ; RV32ZBA-NEXT:    mul a6, a1, a2
 ; RV32ZBA-NEXT:    mulhu a7, a0, a2
 ; RV32ZBA-NEXT:    snez t0, a3
-; RV32ZBA-NEXT:    mulhu a3, a3, a0
-; RV32ZBA-NEXT:    mul t1, a0, a2
-; RV32ZBA-NEXT:    mulhu a0, a1, a2
-; RV32ZBA-NEXT:    snez a1, a1
 ; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    and a1, a1, t0
-; RV32ZBA-NEXT:    snez a0, a0
-; RV32ZBA-NEXT:    snez a2, a3
-; RV32ZBA-NEXT:    add a5, a7, a5
-; RV32ZBA-NEXT:    or a0, a1, a0
-; RV32ZBA-NEXT:    sltu a1, a5, a7
-; RV32ZBA-NEXT:    or a0, a0, a2
-; RV32ZBA-NEXT:    or a0, a0, a1
-; RV32ZBA-NEXT:    sw t1, 0(a4)
-; RV32ZBA-NEXT:    sw a5, 4(a4)
+; RV32ZBA-NEXT:    snez a6, a1
+; RV32ZBA-NEXT:    mulhu a1, a1, a2
+; RV32ZBA-NEXT:    mulhu a3, a3, a0
+; RV32ZBA-NEXT:    and a6, a6, t0
+; RV32ZBA-NEXT:    snez t0, a1
+; RV32ZBA-NEXT:    snez a3, a3
+; RV32ZBA-NEXT:    add a1, a7, a5
+; RV32ZBA-NEXT:    or a5, a6, t0
+; RV32ZBA-NEXT:    sltu a6, a1, a7
+; RV32ZBA-NEXT:    or a3, a5, a3
+; RV32ZBA-NEXT:    or a6, a3, a6
+; RV32ZBA-NEXT:    j .LBB26_7
+; RV32ZBA-NEXT:  .LBB26_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB26_6
+; RV32ZBA-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu a6, a0, a2
+; RV32ZBA-NEXT:    mul a7, a1, a2
+; RV32ZBA-NEXT:    mul a5, a0, a2
+; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    mulhu a2, a0, a3
+; RV32ZBA-NEXT:    mul a1, a1, a3
+; RV32ZBA-NEXT:    add a2, a2, a1
+; RV32ZBA-NEXT:    mul a1, a0, a3
+; RV32ZBA-NEXT:    add a1, a6, a1
+; RV32ZBA-NEXT:    sltu a0, a1, a6
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    snez a6, a0
+; RV32ZBA-NEXT:    j .LBB26_8
+; RV32ZBA-NEXT:  .LBB26_5: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu a6, a2, a0
+; RV32ZBA-NEXT:    mul a7, a3, a0
+; RV32ZBA-NEXT:    mul a5, a2, a0
+; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    mulhu a0, a2, a1
+; RV32ZBA-NEXT:    mul a3, a3, a1
+; RV32ZBA-NEXT:    add a0, a0, a3
+; RV32ZBA-NEXT:    mul a1, a2, a1
+; RV32ZBA-NEXT:    add a1, a6, a1
+; RV32ZBA-NEXT:    sltu a2, a1, a6
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:    snez a6, a0
+; RV32ZBA-NEXT:    j .LBB26_8
+; RV32ZBA-NEXT:  .LBB26_6: # %overflow.no
+; RV32ZBA-NEXT:    li a6, 0
+; RV32ZBA-NEXT:    mulhu a5, a0, a2
+; RV32ZBA-NEXT:    mul a3, a0, a3
+; RV32ZBA-NEXT:    add a3, a5, a3
+; RV32ZBA-NEXT:    mul a1, a1, a2
+; RV32ZBA-NEXT:    add a1, a3, a1
+; RV32ZBA-NEXT:  .LBB26_7: # %overflow.res
+; RV32ZBA-NEXT:    mul a5, a0, a2
+; RV32ZBA-NEXT:  .LBB26_8: # %overflow.res
+; RV32ZBA-NEXT:    andi a0, a6, 1
+; RV32ZBA-NEXT:    sw a5, 0(a4)
+; RV32ZBA-NEXT:    sw a1, 4(a4)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo.i64:
@@ -1830,26 +2420,71 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    beqz a1, .LBB26_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB26_5
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow
 ; RV32ZICOND-NEXT:    mul a5, a3, a0
 ; RV32ZICOND-NEXT:    mul a6, a1, a2
 ; RV32ZICOND-NEXT:    mulhu a7, a0, a2
 ; RV32ZICOND-NEXT:    snez t0, a3
-; RV32ZICOND-NEXT:    mulhu a3, a3, a0
-; RV32ZICOND-NEXT:    mul t1, a0, a2
-; RV32ZICOND-NEXT:    mulhu a0, a1, a2
-; RV32ZICOND-NEXT:    snez a1, a1
 ; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    and a1, a1, t0
-; RV32ZICOND-NEXT:    snez a0, a0
-; RV32ZICOND-NEXT:    snez a2, a3
-; RV32ZICOND-NEXT:    add a5, a7, a5
-; RV32ZICOND-NEXT:    or a0, a1, a0
-; RV32ZICOND-NEXT:    sltu a1, a5, a7
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    or a0, a0, a1
-; RV32ZICOND-NEXT:    sw t1, 0(a4)
-; RV32ZICOND-NEXT:    sw a5, 4(a4)
+; RV32ZICOND-NEXT:    snez a6, a1
+; RV32ZICOND-NEXT:    mulhu a1, a1, a2
+; RV32ZICOND-NEXT:    mulhu a3, a3, a0
+; RV32ZICOND-NEXT:    and a6, a6, t0
+; RV32ZICOND-NEXT:    snez t0, a1
+; RV32ZICOND-NEXT:    snez a3, a3
+; RV32ZICOND-NEXT:    add a1, a7, a5
+; RV32ZICOND-NEXT:    or a5, a6, t0
+; RV32ZICOND-NEXT:    sltu a6, a1, a7
+; RV32ZICOND-NEXT:    or a3, a5, a3
+; RV32ZICOND-NEXT:    or a6, a3, a6
+; RV32ZICOND-NEXT:    j .LBB26_7
+; RV32ZICOND-NEXT:  .LBB26_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB26_6
+; RV32ZICOND-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    mulhu a6, a0, a2
+; RV32ZICOND-NEXT:    mul a7, a1, a2
+; RV32ZICOND-NEXT:    mul a5, a0, a2
+; RV32ZICOND-NEXT:    add a6, a6, a7
+; RV32ZICOND-NEXT:    mulhu a2, a0, a3
+; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    add a2, a2, a1
+; RV32ZICOND-NEXT:    mul a1, a0, a3
+; RV32ZICOND-NEXT:    add a1, a6, a1
+; RV32ZICOND-NEXT:    sltu a0, a1, a6
+; RV32ZICOND-NEXT:    add a0, a2, a0
+; RV32ZICOND-NEXT:    snez a6, a0
+; RV32ZICOND-NEXT:    j .LBB26_8
+; RV32ZICOND-NEXT:  .LBB26_5: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    mulhu a6, a2, a0
+; RV32ZICOND-NEXT:    mul a7, a3, a0
+; RV32ZICOND-NEXT:    mul a5, a2, a0
+; RV32ZICOND-NEXT:    add a6, a6, a7
+; RV32ZICOND-NEXT:    mulhu a0, a2, a1
+; RV32ZICOND-NEXT:    mul a3, a3, a1
+; RV32ZICOND-NEXT:    add a0, a0, a3
+; RV32ZICOND-NEXT:    mul a1, a2, a1
+; RV32ZICOND-NEXT:    add a1, a6, a1
+; RV32ZICOND-NEXT:    sltu a2, a1, a6
+; RV32ZICOND-NEXT:    add a0, a0, a2
+; RV32ZICOND-NEXT:    snez a6, a0
+; RV32ZICOND-NEXT:    j .LBB26_8
+; RV32ZICOND-NEXT:  .LBB26_6: # %overflow.no
+; RV32ZICOND-NEXT:    li a6, 0
+; RV32ZICOND-NEXT:    mulhu a5, a0, a2
+; RV32ZICOND-NEXT:    mul a3, a0, a3
+; RV32ZICOND-NEXT:    add a3, a5, a3
+; RV32ZICOND-NEXT:    mul a1, a1, a2
+; RV32ZICOND-NEXT:    add a1, a3, a1
+; RV32ZICOND-NEXT:  .LBB26_7: # %overflow.res
+; RV32ZICOND-NEXT:    mul a5, a0, a2
+; RV32ZICOND-NEXT:  .LBB26_8: # %overflow.res
+; RV32ZICOND-NEXT:    andi a0, a6, 1
+; RV32ZICOND-NEXT:    sw a5, 0(a4)
+; RV32ZICOND-NEXT:    sw a1, 4(a4)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo.i64:
@@ -1870,18 +2505,30 @@ entry:
 
 define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32-LABEL: umulo2.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    beqz a1, .LBB27_2
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    li a4, 13
+; RV32-NEXT:    mul a3, a0, a4
+; RV32-NEXT:    mulhu a0, a0, a4
+; RV32-NEXT:    mulhu a5, a1, a4
+; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    add a0, a5, a0
+; RV32-NEXT:    snez a4, a0
+; RV32-NEXT:    j .LBB27_3
+; RV32-NEXT:  .LBB27_2: # %overflow.no.lhs
+; RV32-NEXT:    li a4, 0
 ; RV32-NEXT:    li a3, 13
-; RV32-NEXT:    mul a4, a1, a3
 ; RV32-NEXT:    mulhu a5, a0, a3
-; RV32-NEXT:    mulhu a1, a1, a3
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, a5, a1
 ; RV32-NEXT:    mul a3, a0, a3
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    snez a0, a1
-; RV32-NEXT:    sltu a1, a4, a5
-; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:  .LBB27_3: # %overflow.res
+; RV32-NEXT:    andi a0, a4, 1
 ; RV32-NEXT:    sw a3, 0(a2)
-; RV32-NEXT:    sw a4, 4(a2)
+; RV32-NEXT:    sw a1, 4(a2)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo2.i64:
@@ -1895,20 +2542,34 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo2.i64:
-; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    li a3, 13
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    beqz a1, .LBB27_2
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    sh1add a3, a0, a0
+; RV32ZBA-NEXT:    li a5, 13
+; RV32ZBA-NEXT:    sh1add a6, a1, a1
+; RV32ZBA-NEXT:    sh2add a4, a3, a0
+; RV32ZBA-NEXT:    mulhu a0, a0, a5
+; RV32ZBA-NEXT:    mulhu a3, a1, a5
+; RV32ZBA-NEXT:    sh2add a1, a6, a1
+; RV32ZBA-NEXT:    add a1, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a3, a0
+; RV32ZBA-NEXT:    snez a3, a0
+; RV32ZBA-NEXT:    j .LBB27_3
+; RV32ZBA-NEXT:  .LBB27_2: # %overflow.no.lhs
+; RV32ZBA-NEXT:    li a3, 0
 ; RV32ZBA-NEXT:    sh1add a4, a1, a1
-; RV32ZBA-NEXT:    sh1add a5, a0, a0
-; RV32ZBA-NEXT:    sh2add a4, a4, a1
-; RV32ZBA-NEXT:    mulhu a1, a1, a3
-; RV32ZBA-NEXT:    mulhu a3, a0, a3
-; RV32ZBA-NEXT:    sh2add a5, a5, a0
-; RV32ZBA-NEXT:    add a4, a3, a4
-; RV32ZBA-NEXT:    snez a0, a1
-; RV32ZBA-NEXT:    sltu a1, a4, a3
-; RV32ZBA-NEXT:    or a0, a0, a1
-; RV32ZBA-NEXT:    sw a5, 0(a2)
-; RV32ZBA-NEXT:    sw a4, 4(a2)
+; RV32ZBA-NEXT:    sh2add a1, a4, a1
+; RV32ZBA-NEXT:    li a4, 13
+; RV32ZBA-NEXT:    mulhu a4, a0, a4
+; RV32ZBA-NEXT:    add a1, a4, a1
+; RV32ZBA-NEXT:    sh1add a4, a0, a0
+; RV32ZBA-NEXT:    sh2add a4, a4, a0
+; RV32ZBA-NEXT:  .LBB27_3: # %overflow.res
+; RV32ZBA-NEXT:    andi a0, a3, 1
+; RV32ZBA-NEXT:    sw a4, 0(a2)
+; RV32ZBA-NEXT:    sw a1, 4(a2)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo2.i64:
@@ -1923,18 +2584,30 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo2.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    beqz a1, .LBB27_2
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    li a4, 13
+; RV32ZICOND-NEXT:    mul a3, a0, a4
+; RV32ZICOND-NEXT:    mulhu a0, a0, a4
+; RV32ZICOND-NEXT:    mulhu a5, a1, a4
+; RV32ZICOND-NEXT:    mul a1, a1, a4
+; RV32ZICOND-NEXT:    add a1, a0, a1
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    add a0, a5, a0
+; RV32ZICOND-NEXT:    snez a4, a0
+; RV32ZICOND-NEXT:    j .LBB27_3
+; RV32ZICOND-NEXT:  .LBB27_2: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    li a4, 0
 ; RV32ZICOND-NEXT:    li a3, 13
-; RV32ZICOND-NEXT:    mul a4, a1, a3
 ; RV32ZICOND-NEXT:    mulhu a5, a0, a3
-; RV32ZICOND-NEXT:    mulhu a1, a1, a3
+; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    add a1, a5, a1
 ; RV32ZICOND-NEXT:    mul a3, a0, a3
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    snez a0, a1
-; RV32ZICOND-NEXT:    sltu a1, a4, a5
-; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:  .LBB27_3: # %overflow.res
+; RV32ZICOND-NEXT:    andi a0, a4, 1
 ; RV32ZICOND-NEXT:    sw a3, 0(a2)
-; RV32ZICOND-NEXT:    sw a4, 4(a2)
+; RV32ZICOND-NEXT:    sw a1, 4(a2)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo2.i64:
@@ -3218,7 +3891,13 @@ entry:
 
 define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: smulo.select.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    srai a5, a0, 31
+; RV32-NEXT:    srai a4, a2, 31
+; RV32-NEXT:    beq a1, a5, .LBB46_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beq a3, a4, .LBB46_6
+; RV32-NEXT:  # %bb.2: # %overflow
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    mulhsu a6, a1, a2
@@ -3246,11 +3925,119 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    xor a5, a5, a4
 ; RV32-NEXT:    xor a4, a6, a4
 ; RV32-NEXT:    or a4, a4, a5
-; RV32-NEXT:    bnez a4, .LBB46_2
-; RV32-NEXT:  # %bb.1: # %entry
+; RV32-NEXT:    j .LBB46_26
+; RV32-NEXT:  .LBB46_3: # %overflow.no.lhs
+; RV32-NEXT:    beq a3, a4, .LBB46_8
+; RV32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a1, .LBB46_9
+; RV32-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    bgez a1, .LBB46_10
+; RV32-NEXT:    j .LBB46_11
+; RV32-NEXT:  .LBB46_6: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a3, .LBB46_13
+; RV32-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a4, a2
+; RV32-NEXT:    mv a5, a3
+; RV32-NEXT:    bgez a3, .LBB46_14
+; RV32-NEXT:    j .LBB46_15
+; RV32-NEXT:  .LBB46_8: # %overflow.no
+; RV32-NEXT:    j .LBB46_27
+; RV32-NEXT:  .LBB46_9:
+; RV32-NEXT:    neg a4, a0
+; RV32-NEXT:    snez a5, a0
+; RV32-NEXT:    neg a6, a1
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    bltz a1, .LBB46_11
+; RV32-NEXT:  .LBB46_10: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:  .LBB46_11: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a3, .LBB46_17
+; RV32-NEXT:  # %bb.12: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a7, a2
+; RV32-NEXT:    mv a6, a3
+; RV32-NEXT:    j .LBB46_18
+; RV32-NEXT:  .LBB46_13:
+; RV32-NEXT:    neg a4, a2
+; RV32-NEXT:    snez a5, a2
+; RV32-NEXT:    neg a6, a3
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    bltz a3, .LBB46_15
+; RV32-NEXT:  .LBB46_14: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a5, a3
+; RV32-NEXT:    mv a4, a2
+; RV32-NEXT:  .LBB46_15: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a1, .LBB46_21
+; RV32-NEXT:  # %bb.16: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a7, a0
+; RV32-NEXT:    mv a6, a1
+; RV32-NEXT:    j .LBB46_22
+; RV32-NEXT:  .LBB46_17:
+; RV32-NEXT:    neg a7, a2
+; RV32-NEXT:    snez a6, a2
+; RV32-NEXT:    neg t0, a3
+; RV32-NEXT:    sub a6, t0, a6
+; RV32-NEXT:  .LBB46_18: # %overflow.no.lhs.only
+; RV32-NEXT:    slti t0, a1, 0
+; RV32-NEXT:    slti t1, a3, 0
+; RV32-NEXT:    bltz a3, .LBB46_20
+; RV32-NEXT:  # %bb.19: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a6, a3
+; RV32-NEXT:    mv a7, a2
+; RV32-NEXT:  .LBB46_20: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu t2, a4, a7
+; RV32-NEXT:    mul t3, a5, a7
+; RV32-NEXT:    mul a7, a4, a7
+; RV32-NEXT:    mul a5, a5, a6
+; RV32-NEXT:    mulhu t4, a4, a6
+; RV32-NEXT:    mul a4, a4, a6
+; RV32-NEXT:    xor a6, t1, t0
+; RV32-NEXT:    j .LBB46_25
+; RV32-NEXT:  .LBB46_21:
+; RV32-NEXT:    neg a7, a0
+; RV32-NEXT:    snez a6, a0
+; RV32-NEXT:    neg t0, a1
+; RV32-NEXT:    sub a6, t0, a6
+; RV32-NEXT:  .LBB46_22: # %overflow.no.rhs.only
+; RV32-NEXT:    slti t0, a3, 0
+; RV32-NEXT:    slti t1, a1, 0
+; RV32-NEXT:    bltz a1, .LBB46_24
+; RV32-NEXT:  # %bb.23: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a6, a1
+; RV32-NEXT:    mv a7, a0
+; RV32-NEXT:  .LBB46_24: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu t2, a4, a7
+; RV32-NEXT:    mul t3, a5, a7
+; RV32-NEXT:    mul a7, a4, a7
+; RV32-NEXT:    mul a5, a5, a6
+; RV32-NEXT:    mulhu t4, a4, a6
+; RV32-NEXT:    mul a4, a4, a6
+; RV32-NEXT:    xor a6, t0, t1
+; RV32-NEXT:  .LBB46_25: # %overflow.res
+; RV32-NEXT:    add t2, t2, t3
+; RV32-NEXT:    add a5, t4, a5
+; RV32-NEXT:    neg t0, a6
+; RV32-NEXT:    add a4, t2, a4
+; RV32-NEXT:    xor a7, a7, t0
+; RV32-NEXT:    sltu t1, a4, t2
+; RV32-NEXT:    add a7, a7, a6
+; RV32-NEXT:    xor a4, a4, t0
+; RV32-NEXT:    add a5, a5, t1
+; RV32-NEXT:    sltu a6, a7, a6
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    sltu a4, a4, a6
+; RV32-NEXT:    xor a5, a5, t0
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:  .LBB46_26: # %overflow.res
+; RV32-NEXT:    snez a4, a4
+; RV32-NEXT:    andi a4, a4, 1
+; RV32-NEXT:    bnez a4, .LBB46_28
+; RV32-NEXT:  .LBB46_27: # %overflow.res
 ; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:  .LBB46_2: # %entry
+; RV32-NEXT:  .LBB46_28: # %overflow.res
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo.select.i64:
@@ -3265,7 +4052,13 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: smulo.select.i64:
-; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    srai a5, a0, 31
+; RV32ZBA-NEXT:    srai a4, a2, 31
+; RV32ZBA-NEXT:    beq a1, a5, .LBB46_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beq a3, a4, .LBB46_6
+; RV32ZBA-NEXT:  # %bb.2: # %overflow
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    mulhsu a6, a1, a2
@@ -3293,11 +4086,119 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    xor a5, a5, a4
 ; RV32ZBA-NEXT:    xor a4, a6, a4
 ; RV32ZBA-NEXT:    or a4, a4, a5
-; RV32ZBA-NEXT:    bnez a4, .LBB46_2
-; RV32ZBA-NEXT:  # %bb.1: # %entry
+; RV32ZBA-NEXT:    j .LBB46_26
+; RV32ZBA-NEXT:  .LBB46_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beq a3, a4, .LBB46_8
+; RV32ZBA-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB46_9
+; RV32ZBA-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    bgez a1, .LBB46_10
+; RV32ZBA-NEXT:    j .LBB46_11
+; RV32ZBA-NEXT:  .LBB46_6: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB46_13
+; RV32ZBA-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a4, a2
+; RV32ZBA-NEXT:    mv a5, a3
+; RV32ZBA-NEXT:    bgez a3, .LBB46_14
+; RV32ZBA-NEXT:    j .LBB46_15
+; RV32ZBA-NEXT:  .LBB46_8: # %overflow.no
+; RV32ZBA-NEXT:    j .LBB46_27
+; RV32ZBA-NEXT:  .LBB46_9:
+; RV32ZBA-NEXT:    neg a4, a0
+; RV32ZBA-NEXT:    snez a5, a0
+; RV32ZBA-NEXT:    neg a6, a1
+; RV32ZBA-NEXT:    sub a5, a6, a5
+; RV32ZBA-NEXT:    bltz a1, .LBB46_11
+; RV32ZBA-NEXT:  .LBB46_10: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:  .LBB46_11: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB46_17
+; RV32ZBA-NEXT:  # %bb.12: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a7, a2
+; RV32ZBA-NEXT:    mv a6, a3
+; RV32ZBA-NEXT:    j .LBB46_18
+; RV32ZBA-NEXT:  .LBB46_13:
+; RV32ZBA-NEXT:    neg a4, a2
+; RV32ZBA-NEXT:    snez a5, a2
+; RV32ZBA-NEXT:    neg a6, a3
+; RV32ZBA-NEXT:    sub a5, a6, a5
+; RV32ZBA-NEXT:    bltz a3, .LBB46_15
+; RV32ZBA-NEXT:  .LBB46_14: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a5, a3
+; RV32ZBA-NEXT:    mv a4, a2
+; RV32ZBA-NEXT:  .LBB46_15: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB46_21
+; RV32ZBA-NEXT:  # %bb.16: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a7, a0
+; RV32ZBA-NEXT:    mv a6, a1
+; RV32ZBA-NEXT:    j .LBB46_22
+; RV32ZBA-NEXT:  .LBB46_17:
+; RV32ZBA-NEXT:    neg a7, a2
+; RV32ZBA-NEXT:    snez a6, a2
+; RV32ZBA-NEXT:    neg t0, a3
+; RV32ZBA-NEXT:    sub a6, t0, a6
+; RV32ZBA-NEXT:  .LBB46_18: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    slti t0, a1, 0
+; RV32ZBA-NEXT:    slti t1, a3, 0
+; RV32ZBA-NEXT:    bltz a3, .LBB46_20
+; RV32ZBA-NEXT:  # %bb.19: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a6, a3
+; RV32ZBA-NEXT:    mv a7, a2
+; RV32ZBA-NEXT:  .LBB46_20: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu t2, a4, a7
+; RV32ZBA-NEXT:    mul t3, a5, a7
+; RV32ZBA-NEXT:    mul a7, a4, a7
+; RV32ZBA-NEXT:    mul a5, a5, a6
+; RV32ZBA-NEXT:    mulhu t4, a4, a6
+; RV32ZBA-NEXT:    mul a4, a4, a6
+; RV32ZBA-NEXT:    xor a6, t1, t0
+; RV32ZBA-NEXT:    j .LBB46_25
+; RV32ZBA-NEXT:  .LBB46_21:
+; RV32ZBA-NEXT:    neg a7, a0
+; RV32ZBA-NEXT:    snez a6, a0
+; RV32ZBA-NEXT:    neg t0, a1
+; RV32ZBA-NEXT:    sub a6, t0, a6
+; RV32ZBA-NEXT:  .LBB46_22: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    slti t0, a3, 0
+; RV32ZBA-NEXT:    slti t1, a1, 0
+; RV32ZBA-NEXT:    bltz a1, .LBB46_24
+; RV32ZBA-NEXT:  # %bb.23: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a6, a1
+; RV32ZBA-NEXT:    mv a7, a0
+; RV32ZBA-NEXT:  .LBB46_24: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu t2, a4, a7
+; RV32ZBA-NEXT:    mul t3, a5, a7
+; RV32ZBA-NEXT:    mul a7, a4, a7
+; RV32ZBA-NEXT:    mul a5, a5, a6
+; RV32ZBA-NEXT:    mulhu t4, a4, a6
+; RV32ZBA-NEXT:    mul a4, a4, a6
+; RV32ZBA-NEXT:    xor a6, t0, t1
+; RV32ZBA-NEXT:  .LBB46_25: # %overflow.res
+; RV32ZBA-NEXT:    add t2, t2, t3
+; RV32ZBA-NEXT:    add a5, t4, a5
+; RV32ZBA-NEXT:    neg t0, a6
+; RV32ZBA-NEXT:    add a4, t2, a4
+; RV32ZBA-NEXT:    xor a7, a7, t0
+; RV32ZBA-NEXT:    sltu t1, a4, t2
+; RV32ZBA-NEXT:    add a7, a7, a6
+; RV32ZBA-NEXT:    xor a4, a4, t0
+; RV32ZBA-NEXT:    add a5, a5, t1
+; RV32ZBA-NEXT:    sltu a6, a7, a6
+; RV32ZBA-NEXT:    add a4, a4, a6
+; RV32ZBA-NEXT:    sltu a4, a4, a6
+; RV32ZBA-NEXT:    xor a5, a5, t0
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:  .LBB46_26: # %overflow.res
+; RV32ZBA-NEXT:    snez a4, a4
+; RV32ZBA-NEXT:    andi a4, a4, 1
+; RV32ZBA-NEXT:    bnez a4, .LBB46_28
+; RV32ZBA-NEXT:  .LBB46_27: # %overflow.res
 ; RV32ZBA-NEXT:    mv a0, a2
 ; RV32ZBA-NEXT:    mv a1, a3
-; RV32ZBA-NEXT:  .LBB46_2: # %entry
+; RV32ZBA-NEXT:  .LBB46_28: # %overflow.res
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo.select.i64:
@@ -3312,7 +4213,13 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: smulo.select.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    srai a5, a0, 31
+; RV32ZICOND-NEXT:    srai a4, a2, 31
+; RV32ZICOND-NEXT:    beq a1, a5, .LBB46_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beq a3, a4, .LBB46_5
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow
 ; RV32ZICOND-NEXT:    mulhu a4, a0, a2
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
 ; RV32ZICOND-NEXT:    mulhsu a6, a1, a2
@@ -3335,11 +4242,99 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    srai a4, a4, 31
 ; RV32ZICOND-NEXT:    add a6, a7, a6
 ; RV32ZICOND-NEXT:    sltu a7, a6, a7
-; RV32ZICOND-NEXT:    xor a6, a6, a4
 ; RV32ZICOND-NEXT:    add a5, t0, a5
 ; RV32ZICOND-NEXT:    add a5, a5, a7
-; RV32ZICOND-NEXT:    xor a4, a5, a4
-; RV32ZICOND-NEXT:    or a4, a6, a4
+; RV32ZICOND-NEXT:    xor a5, a5, a4
+; RV32ZICOND-NEXT:    xor a4, a6, a4
+; RV32ZICOND-NEXT:    or a4, a4, a5
+; RV32ZICOND-NEXT:    j .LBB46_7
+; RV32ZICOND-NEXT:  .LBB46_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beq a3, a4, .LBB46_8
+; RV32ZICOND-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    slti a4, a1, 0
+; RV32ZICOND-NEXT:    neg a5, a0
+; RV32ZICOND-NEXT:    snez a6, a0
+; RV32ZICOND-NEXT:    neg a7, a1
+; RV32ZICOND-NEXT:    slti t0, a3, 0
+; RV32ZICOND-NEXT:    neg t1, a2
+; RV32ZICOND-NEXT:    snez t2, a2
+; RV32ZICOND-NEXT:    neg t3, a3
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    czero.nez t4, a0, a4
+; RV32ZICOND-NEXT:    sub a6, a7, a6
+; RV32ZICOND-NEXT:    czero.nez a7, a1, a4
+; RV32ZICOND-NEXT:    czero.eqz t1, t1, t0
+; RV32ZICOND-NEXT:    sub t2, t3, t2
+; RV32ZICOND-NEXT:    czero.nez t3, a2, t0
+; RV32ZICOND-NEXT:    or a5, a5, t4
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    or a5, a5, t4
+; RV32ZICOND-NEXT:    czero.nez t4, a3, t0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    or a6, a6, a7
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    xor a4, t0, a4
+; RV32ZICOND-NEXT:    j .LBB46_6
+; RV32ZICOND-NEXT:  .LBB46_5: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    slti a4, a3, 0
+; RV32ZICOND-NEXT:    neg a5, a2
+; RV32ZICOND-NEXT:    snez a6, a2
+; RV32ZICOND-NEXT:    neg a7, a3
+; RV32ZICOND-NEXT:    slti t0, a1, 0
+; RV32ZICOND-NEXT:    neg t1, a0
+; RV32ZICOND-NEXT:    snez t2, a0
+; RV32ZICOND-NEXT:    neg t3, a1
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    czero.nez t4, a2, a4
+; RV32ZICOND-NEXT:    sub a6, a7, a6
+; RV32ZICOND-NEXT:    czero.nez a7, a3, a4
+; RV32ZICOND-NEXT:    czero.eqz t1, t1, t0
+; RV32ZICOND-NEXT:    sub t2, t3, t2
+; RV32ZICOND-NEXT:    czero.nez t3, a0, t0
+; RV32ZICOND-NEXT:    or a5, a5, t4
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    or a5, a5, t4
+; RV32ZICOND-NEXT:    czero.nez t4, a1, t0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    or a6, a6, a7
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    xor a4, a4, t0
+; RV32ZICOND-NEXT:  .LBB46_6: # %overflow.res
+; RV32ZICOND-NEXT:    or t1, t1, t3
+; RV32ZICOND-NEXT:    czero.eqz t2, t2, t0
+; RV32ZICOND-NEXT:    or t2, t2, t4
+; RV32ZICOND-NEXT:    czero.eqz t1, t1, t0
+; RV32ZICOND-NEXT:    czero.eqz t0, t2, t0
+; RV32ZICOND-NEXT:    or t1, t1, t3
+; RV32ZICOND-NEXT:    or a6, a6, a7
+; RV32ZICOND-NEXT:    or a7, t0, t4
+; RV32ZICOND-NEXT:    mulhu t0, a5, t1
+; RV32ZICOND-NEXT:    mul t2, a5, t1
+; RV32ZICOND-NEXT:    mul t1, a6, t1
+; RV32ZICOND-NEXT:    mul a6, a6, a7
+; RV32ZICOND-NEXT:    mulhu t3, a5, a7
+; RV32ZICOND-NEXT:    mul a5, a5, a7
+; RV32ZICOND-NEXT:    neg a7, a4
+; RV32ZICOND-NEXT:    xor t2, t2, a7
+; RV32ZICOND-NEXT:    add t0, t0, t1
+; RV32ZICOND-NEXT:    add a6, t3, a6
+; RV32ZICOND-NEXT:    add t2, t2, a4
+; RV32ZICOND-NEXT:    add a5, t0, a5
+; RV32ZICOND-NEXT:    sltu a4, t2, a4
+; RV32ZICOND-NEXT:    sltu t0, a5, t0
+; RV32ZICOND-NEXT:    xor a5, a5, a7
+; RV32ZICOND-NEXT:    add a6, a6, t0
+; RV32ZICOND-NEXT:    add a5, a5, a4
+; RV32ZICOND-NEXT:    sltu a4, a5, a4
+; RV32ZICOND-NEXT:    xor a5, a6, a7
+; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:  .LBB46_7: # %overflow.res
+; RV32ZICOND-NEXT:    snez a4, a4
+; RV32ZICOND-NEXT:    j .LBB46_9
+; RV32ZICOND-NEXT:  .LBB46_8: # %overflow.no
+; RV32ZICOND-NEXT:    li a4, 0
+; RV32ZICOND-NEXT:  .LBB46_9: # %overflow.res
+; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
@@ -3367,7 +4362,13 @@ entry:
 
 define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: smulo.not.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    srai a5, a0, 31
+; RV32-NEXT:    srai a4, a2, 31
+; RV32-NEXT:    beq a1, a5, .LBB47_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beq a3, a4, .LBB47_6
+; RV32-NEXT:  # %bb.2: # %overflow
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    mulhsu a2, a1, a2
@@ -3395,27 +4396,154 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    xor a0, a0, a4
 ; RV32-NEXT:    xor a4, a5, a4
 ; RV32-NEXT:    or a0, a4, a0
-; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    j .LBB47_25
+; RV32-NEXT:  .LBB47_3: # %overflow.no.lhs
+; RV32-NEXT:    beq a3, a4, .LBB47_8
+; RV32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a1, .LBB47_9
+; RV32-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    bgez a1, .LBB47_10
+; RV32-NEXT:    j .LBB47_11
+; RV32-NEXT:  .LBB47_6: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a3, .LBB47_13
+; RV32-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a4, a2
+; RV32-NEXT:    mv a5, a3
+; RV32-NEXT:    bgez a3, .LBB47_14
+; RV32-NEXT:    j .LBB47_15
+; RV32-NEXT:  .LBB47_8: # %overflow.no
+; RV32-NEXT:    li a0, 1
 ; RV32-NEXT:    ret
-;
-; RV64-LABEL: smulo.not.i64:
-; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    mulh a2, a0, a1
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    srai a0, a0, 63
-; RV64-NEXT:    xor a0, a2, a0
-; RV64-NEXT:    seqz a0, a0
-; RV64-NEXT:    ret
-;
-; RV32ZBA-LABEL: smulo.not.i64:
-; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    mulhu a4, a0, a2
-; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    mulhsu a2, a1, a2
-; RV32ZBA-NEXT:    mul a6, a3, a0
-; RV32ZBA-NEXT:    mulhsu a0, a3, a0
-; RV32ZBA-NEXT:    mulh a7, a1, a3
-; RV32ZBA-NEXT:    mul a1, a1, a3
+; RV32-NEXT:  .LBB47_9:
+; RV32-NEXT:    neg a4, a0
+; RV32-NEXT:    snez a5, a0
+; RV32-NEXT:    neg a6, a1
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    bltz a1, .LBB47_11
+; RV32-NEXT:  .LBB47_10: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:  .LBB47_11: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a3, .LBB47_17
+; RV32-NEXT:  # %bb.12: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a6, a2
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    j .LBB47_18
+; RV32-NEXT:  .LBB47_13:
+; RV32-NEXT:    neg a4, a2
+; RV32-NEXT:    snez a5, a2
+; RV32-NEXT:    neg a6, a3
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    bltz a3, .LBB47_15
+; RV32-NEXT:  .LBB47_14: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a5, a3
+; RV32-NEXT:    mv a4, a2
+; RV32-NEXT:  .LBB47_15: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a1, .LBB47_21
+; RV32-NEXT:  # %bb.16: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a6, a0
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    j .LBB47_22
+; RV32-NEXT:  .LBB47_17:
+; RV32-NEXT:    neg a6, a2
+; RV32-NEXT:    snez a0, a2
+; RV32-NEXT:    neg a7, a3
+; RV32-NEXT:    sub a0, a7, a0
+; RV32-NEXT:  .LBB47_18: # %overflow.no.lhs.only
+; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    slti a7, a3, 0
+; RV32-NEXT:    bltz a3, .LBB47_20
+; RV32-NEXT:  # %bb.19: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    mv a6, a2
+; RV32-NEXT:  .LBB47_20: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu a2, a4, a6
+; RV32-NEXT:    mul a3, a5, a6
+; RV32-NEXT:    mul a6, a4, a6
+; RV32-NEXT:    mul a5, a5, a0
+; RV32-NEXT:    mulhu t0, a4, a0
+; RV32-NEXT:    mul a0, a4, a0
+; RV32-NEXT:    xor a1, a7, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a5, t0, a5
+; RV32-NEXT:    neg a3, a1
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    xor a4, a6, a3
+; RV32-NEXT:    sltu a2, a0, a2
+; RV32-NEXT:    add a4, a4, a1
+; RV32-NEXT:    xor a0, a0, a3
+; RV32-NEXT:    add a2, a5, a2
+; RV32-NEXT:    sltu a1, a4, a1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    j .LBB47_25
+; RV32-NEXT:  .LBB47_21:
+; RV32-NEXT:    neg a6, a0
+; RV32-NEXT:    snez a2, a0
+; RV32-NEXT:    neg a7, a1
+; RV32-NEXT:    sub a2, a7, a2
+; RV32-NEXT:  .LBB47_22: # %overflow.no.rhs.only
+; RV32-NEXT:    slti a3, a3, 0
+; RV32-NEXT:    slti a7, a1, 0
+; RV32-NEXT:    bltz a1, .LBB47_24
+; RV32-NEXT:  # %bb.23: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a6, a0
+; RV32-NEXT:  .LBB47_24: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu a0, a4, a6
+; RV32-NEXT:    mul a1, a5, a6
+; RV32-NEXT:    mul a6, a4, a6
+; RV32-NEXT:    mul a5, a5, a2
+; RV32-NEXT:    mulhu t0, a4, a2
+; RV32-NEXT:    mul a2, a4, a2
+; RV32-NEXT:    xor a3, a3, a7
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a5, t0, a5
+; RV32-NEXT:    neg a1, a3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    xor a4, a6, a1
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    add a4, a4, a3
+; RV32-NEXT:    xor a2, a2, a1
+; RV32-NEXT:    add a0, a5, a0
+; RV32-NEXT:    sltu a3, a4, a3
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sltu a2, a2, a3
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:  .LBB47_25: # %overflow.res
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: smulo.not.i64:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    mulh a2, a0, a1
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+;
+; RV32ZBA-LABEL: smulo.not.i64:
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    srai a5, a0, 31
+; RV32ZBA-NEXT:    srai a4, a2, 31
+; RV32ZBA-NEXT:    beq a1, a5, .LBB47_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beq a3, a4, .LBB47_6
+; RV32ZBA-NEXT:  # %bb.2: # %overflow
+; RV32ZBA-NEXT:    mulhu a4, a0, a2
+; RV32ZBA-NEXT:    mul a5, a1, a2
+; RV32ZBA-NEXT:    mulhsu a2, a1, a2
+; RV32ZBA-NEXT:    mul a6, a3, a0
+; RV32ZBA-NEXT:    mulhsu a0, a3, a0
+; RV32ZBA-NEXT:    mulh a7, a1, a3
+; RV32ZBA-NEXT:    mul a1, a1, a3
 ; RV32ZBA-NEXT:    add a4, a5, a4
 ; RV32ZBA-NEXT:    sltu a3, a4, a5
 ; RV32ZBA-NEXT:    add a4, a6, a4
@@ -3436,7 +4564,128 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    xor a0, a0, a4
 ; RV32ZBA-NEXT:    xor a4, a5, a4
 ; RV32ZBA-NEXT:    or a0, a4, a0
-; RV32ZBA-NEXT:    seqz a0, a0
+; RV32ZBA-NEXT:    j .LBB47_25
+; RV32ZBA-NEXT:  .LBB47_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beq a3, a4, .LBB47_8
+; RV32ZBA-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB47_9
+; RV32ZBA-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    bgez a1, .LBB47_10
+; RV32ZBA-NEXT:    j .LBB47_11
+; RV32ZBA-NEXT:  .LBB47_6: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB47_13
+; RV32ZBA-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a4, a2
+; RV32ZBA-NEXT:    mv a5, a3
+; RV32ZBA-NEXT:    bgez a3, .LBB47_14
+; RV32ZBA-NEXT:    j .LBB47_15
+; RV32ZBA-NEXT:  .LBB47_8: # %overflow.no
+; RV32ZBA-NEXT:    li a0, 1
+; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB47_9:
+; RV32ZBA-NEXT:    neg a4, a0
+; RV32ZBA-NEXT:    snez a5, a0
+; RV32ZBA-NEXT:    neg a6, a1
+; RV32ZBA-NEXT:    sub a5, a6, a5
+; RV32ZBA-NEXT:    bltz a1, .LBB47_11
+; RV32ZBA-NEXT:  .LBB47_10: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:  .LBB47_11: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB47_17
+; RV32ZBA-NEXT:  # %bb.12: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a6, a2
+; RV32ZBA-NEXT:    mv a0, a3
+; RV32ZBA-NEXT:    j .LBB47_18
+; RV32ZBA-NEXT:  .LBB47_13:
+; RV32ZBA-NEXT:    neg a4, a2
+; RV32ZBA-NEXT:    snez a5, a2
+; RV32ZBA-NEXT:    neg a6, a3
+; RV32ZBA-NEXT:    sub a5, a6, a5
+; RV32ZBA-NEXT:    bltz a3, .LBB47_15
+; RV32ZBA-NEXT:  .LBB47_14: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a5, a3
+; RV32ZBA-NEXT:    mv a4, a2
+; RV32ZBA-NEXT:  .LBB47_15: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB47_21
+; RV32ZBA-NEXT:  # %bb.16: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a6, a0
+; RV32ZBA-NEXT:    mv a2, a1
+; RV32ZBA-NEXT:    j .LBB47_22
+; RV32ZBA-NEXT:  .LBB47_17:
+; RV32ZBA-NEXT:    neg a6, a2
+; RV32ZBA-NEXT:    snez a0, a2
+; RV32ZBA-NEXT:    neg a7, a3
+; RV32ZBA-NEXT:    sub a0, a7, a0
+; RV32ZBA-NEXT:  .LBB47_18: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    slti a1, a1, 0
+; RV32ZBA-NEXT:    slti a7, a3, 0
+; RV32ZBA-NEXT:    bltz a3, .LBB47_20
+; RV32ZBA-NEXT:  # %bb.19: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a0, a3
+; RV32ZBA-NEXT:    mv a6, a2
+; RV32ZBA-NEXT:  .LBB47_20: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu a2, a4, a6
+; RV32ZBA-NEXT:    mul a3, a5, a6
+; RV32ZBA-NEXT:    mul a6, a4, a6
+; RV32ZBA-NEXT:    mul a5, a5, a0
+; RV32ZBA-NEXT:    mulhu t0, a4, a0
+; RV32ZBA-NEXT:    mul a0, a4, a0
+; RV32ZBA-NEXT:    xor a1, a7, a1
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    add a5, t0, a5
+; RV32ZBA-NEXT:    neg a3, a1
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    xor a4, a6, a3
+; RV32ZBA-NEXT:    sltu a2, a0, a2
+; RV32ZBA-NEXT:    add a4, a4, a1
+; RV32ZBA-NEXT:    xor a0, a0, a3
+; RV32ZBA-NEXT:    add a2, a5, a2
+; RV32ZBA-NEXT:    sltu a1, a4, a1
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a0, a1
+; RV32ZBA-NEXT:    xor a2, a2, a3
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    j .LBB47_25
+; RV32ZBA-NEXT:  .LBB47_21:
+; RV32ZBA-NEXT:    neg a6, a0
+; RV32ZBA-NEXT:    snez a2, a0
+; RV32ZBA-NEXT:    neg a7, a1
+; RV32ZBA-NEXT:    sub a2, a7, a2
+; RV32ZBA-NEXT:  .LBB47_22: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    slti a3, a3, 0
+; RV32ZBA-NEXT:    slti a7, a1, 0
+; RV32ZBA-NEXT:    bltz a1, .LBB47_24
+; RV32ZBA-NEXT:  # %bb.23: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a2, a1
+; RV32ZBA-NEXT:    mv a6, a0
+; RV32ZBA-NEXT:  .LBB47_24: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu a0, a4, a6
+; RV32ZBA-NEXT:    mul a1, a5, a6
+; RV32ZBA-NEXT:    mul a6, a4, a6
+; RV32ZBA-NEXT:    mul a5, a5, a2
+; RV32ZBA-NEXT:    mulhu t0, a4, a2
+; RV32ZBA-NEXT:    mul a2, a4, a2
+; RV32ZBA-NEXT:    xor a3, a3, a7
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    add a5, t0, a5
+; RV32ZBA-NEXT:    neg a1, a3
+; RV32ZBA-NEXT:    add a2, a0, a2
+; RV32ZBA-NEXT:    xor a4, a6, a1
+; RV32ZBA-NEXT:    sltu a0, a2, a0
+; RV32ZBA-NEXT:    add a4, a4, a3
+; RV32ZBA-NEXT:    xor a2, a2, a1
+; RV32ZBA-NEXT:    add a0, a5, a0
+; RV32ZBA-NEXT:    sltu a3, a4, a3
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    sltu a2, a2, a3
+; RV32ZBA-NEXT:    xor a0, a0, a1
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:  .LBB47_25: # %overflow.res
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo.not.i64:
@@ -3449,7 +4698,13 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: smulo.not.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    srai a5, a0, 31
+; RV32ZICOND-NEXT:    srai a4, a2, 31
+; RV32ZICOND-NEXT:    beq a1, a5, .LBB47_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beq a3, a4, .LBB47_5
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow
 ; RV32ZICOND-NEXT:    mulhu a4, a0, a2
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
 ; RV32ZICOND-NEXT:    mulhsu a2, a1, a2
@@ -3477,7 +4732,120 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    xor a0, a0, a4
 ; RV32ZICOND-NEXT:    xor a4, a5, a4
 ; RV32ZICOND-NEXT:    or a0, a4, a0
-; RV32ZICOND-NEXT:    seqz a0, a0
+; RV32ZICOND-NEXT:    j .LBB47_6
+; RV32ZICOND-NEXT:  .LBB47_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beq a3, a4, .LBB47_7
+; RV32ZICOND-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    slti a4, a1, 0
+; RV32ZICOND-NEXT:    neg a5, a0
+; RV32ZICOND-NEXT:    snez a6, a0
+; RV32ZICOND-NEXT:    neg a7, a1
+; RV32ZICOND-NEXT:    snez t0, a2
+; RV32ZICOND-NEXT:    sub a6, a7, a6
+; RV32ZICOND-NEXT:    neg a7, a3
+; RV32ZICOND-NEXT:    sub a7, a7, t0
+; RV32ZICOND-NEXT:    slti t0, a3, 0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
+; RV32ZICOND-NEXT:    or a5, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    or a0, a5, a0
+; RV32ZICOND-NEXT:    neg a5, a2
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, t0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, t0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    or a6, a6, a1
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    xor a4, t0, a4
+; RV32ZICOND-NEXT:    or a5, a5, a2
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    or a7, a7, a3
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    neg t0, a4
+; RV32ZICOND-NEXT:    or a2, a5, a2
+; RV32ZICOND-NEXT:    or a1, a6, a1
+; RV32ZICOND-NEXT:    or a3, a7, a3
+; RV32ZICOND-NEXT:    mulhu a5, a0, a2
+; RV32ZICOND-NEXT:    mul a6, a0, a2
+; RV32ZICOND-NEXT:    mul a2, a1, a2
+; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    mulhu a7, a0, a3
+; RV32ZICOND-NEXT:    mul a0, a0, a3
+; RV32ZICOND-NEXT:    xor a3, a6, t0
+; RV32ZICOND-NEXT:    add a2, a5, a2
+; RV32ZICOND-NEXT:    add a1, a7, a1
+; RV32ZICOND-NEXT:    add a3, a3, a4
+; RV32ZICOND-NEXT:    add a0, a2, a0
+; RV32ZICOND-NEXT:    sltu a3, a3, a4
+; RV32ZICOND-NEXT:    sltu a2, a0, a2
+; RV32ZICOND-NEXT:    xor a0, a0, t0
+; RV32ZICOND-NEXT:    add a1, a1, a2
+; RV32ZICOND-NEXT:    add a0, a0, a3
+; RV32ZICOND-NEXT:    sltu a0, a0, a3
+; RV32ZICOND-NEXT:    xor a1, a1, t0
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    j .LBB47_6
+; RV32ZICOND-NEXT:  .LBB47_5: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    slti a4, a3, 0
+; RV32ZICOND-NEXT:    neg a5, a2
+; RV32ZICOND-NEXT:    snez a6, a2
+; RV32ZICOND-NEXT:    neg a7, a3
+; RV32ZICOND-NEXT:    snez t0, a0
+; RV32ZICOND-NEXT:    sub a6, a7, a6
+; RV32ZICOND-NEXT:    neg a7, a1
+; RV32ZICOND-NEXT:    sub a7, a7, t0
+; RV32ZICOND-NEXT:    slti t0, a1, 0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
+; RV32ZICOND-NEXT:    or a5, a5, a2
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    or a2, a5, a2
+; RV32ZICOND-NEXT:    neg a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
+; RV32ZICOND-NEXT:    czero.nez a1, a1, t0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    or a6, a6, a3
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    xor a4, a4, t0
+; RV32ZICOND-NEXT:    or a5, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    or a7, a7, a1
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    neg t0, a4
+; RV32ZICOND-NEXT:    or a0, a5, a0
+; RV32ZICOND-NEXT:    or a3, a6, a3
+; RV32ZICOND-NEXT:    or a1, a7, a1
+; RV32ZICOND-NEXT:    mulhu a5, a2, a0
+; RV32ZICOND-NEXT:    mul a6, a2, a0
+; RV32ZICOND-NEXT:    mul a0, a3, a0
+; RV32ZICOND-NEXT:    mul a3, a3, a1
+; RV32ZICOND-NEXT:    mulhu a7, a2, a1
+; RV32ZICOND-NEXT:    mul a1, a2, a1
+; RV32ZICOND-NEXT:    xor a2, a6, t0
+; RV32ZICOND-NEXT:    add a0, a5, a0
+; RV32ZICOND-NEXT:    add a3, a7, a3
+; RV32ZICOND-NEXT:    add a2, a2, a4
+; RV32ZICOND-NEXT:    add a1, a0, a1
+; RV32ZICOND-NEXT:    sltu a2, a2, a4
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    xor a1, a1, t0
+; RV32ZICOND-NEXT:    add a0, a3, a0
+; RV32ZICOND-NEXT:    add a1, a1, a2
+; RV32ZICOND-NEXT:    sltu a1, a1, a2
+; RV32ZICOND-NEXT:    xor a0, a0, t0
+; RV32ZICOND-NEXT:    add a0, a0, a1
+; RV32ZICOND-NEXT:  .LBB47_6: # %overflow.res
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    xori a0, a0, 1
+; RV32ZICOND-NEXT:    ret
+; RV32ZICOND-NEXT:  .LBB47_7: # %overflow.no
+; RV32ZICOND-NEXT:    li a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo.not.i64:
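
A rough C model of the guard the new %overflow.entry block performs in the RV32 smulo checks above (the srai + beq pair): an i64 operand needs no high-word handling when its high word is just the sign-extension of its low word, and when both operands pass that test the multiply cannot overflow, which is why those paths fall straight through to %overflow.no. The helper names below are made up for illustration; this is a sketch, not the exact IR the pass builds.

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch only: true when the i64 value fits in its low 32 bits, i.e. the
     * high word equals the sign-extension of the low word (srai + beq). */
    bool fits_in_low_word_signed(int64_t v) {
      uint32_t lo = (uint32_t)v;
      uint32_t hi = (uint32_t)((uint64_t)v >> 32);
      uint32_t sign = (lo & 0x80000000u) ? 0xFFFFFFFFu : 0u;
      return hi == sign;
    }

    /* Two values in the 32-bit signed range multiply to at most 2^62 in
     * magnitude, so the i64 product cannot overflow (%overflow.no). */
    bool smul_i64_never_overflows(int64_t a, int64_t b) {
      return fits_in_low_word_signed(a) && fits_in_low_word_signed(b);
    }

The interesting work is in the .only paths, where only one operand passes the check.
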
@@ -3617,7 +4985,11 @@ entry:
 
 define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: umulo.select.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    beqz a1, .LBB50_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beqz a3, .LBB50_5
+; RV32-NEXT:  # %bb.2: # %overflow
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    snez a6, a3
@@ -3634,12 +5006,42 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    snez a6, a6
 ; RV32-NEXT:    or a5, a5, a6
 ; RV32-NEXT:    or a4, a5, a4
-; RV32-NEXT:    bnez a4, .LBB50_2
-; RV32-NEXT:  # %bb.1: # %entry
+; RV32-NEXT:    andi a4, a4, 1
+; RV32-NEXT:    beqz a4, .LBB50_7
+; RV32-NEXT:    j .LBB50_8
+; RV32-NEXT:  .LBB50_3: # %overflow.no.lhs
+; RV32-NEXT:    beqz a3, .LBB50_9
+; RV32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu a4, a0, a2
+; RV32-NEXT:    mul a5, a1, a2
+; RV32-NEXT:    mulhu a6, a0, a3
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    mul a5, a1, a3
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    mul a6, a0, a3
+; RV32-NEXT:    j .LBB50_6
+; RV32-NEXT:  .LBB50_5: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu a4, a2, a0
+; RV32-NEXT:    mul a5, a3, a0
+; RV32-NEXT:    mulhu a6, a2, a1
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    mul a5, a3, a1
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    mul a6, a2, a1
+; RV32-NEXT:  .LBB50_6: # %overflow.res
+; RV32-NEXT:    add a6, a4, a6
+; RV32-NEXT:    sltu a4, a6, a4
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    snez a4, a4
+; RV32-NEXT:    andi a4, a4, 1
+; RV32-NEXT:    bnez a4, .LBB50_8
+; RV32-NEXT:  .LBB50_7: # %overflow.res
 ; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:    mv a1, a3
-; RV32-NEXT:  .LBB50_2: # %entry
+; RV32-NEXT:  .LBB50_8: # %overflow.res
 ; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB50_9: # %overflow.no
+; RV32-NEXT:    j .LBB50_7
 ;
 ; RV64-LABEL: umulo.select.i64:
 ; RV64:       # %bb.0: # %entry
@@ -3651,7 +5053,11 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo.select.i64:
-; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    beqz a1, .LBB50_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB50_5
+; RV32ZBA-NEXT:  # %bb.2: # %overflow
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    snez a6, a3
@@ -3668,12 +5074,42 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    snez a6, a6
 ; RV32ZBA-NEXT:    or a5, a5, a6
 ; RV32ZBA-NEXT:    or a4, a5, a4
-; RV32ZBA-NEXT:    bnez a4, .LBB50_2
-; RV32ZBA-NEXT:  # %bb.1: # %entry
+; RV32ZBA-NEXT:    andi a4, a4, 1
+; RV32ZBA-NEXT:    beqz a4, .LBB50_7
+; RV32ZBA-NEXT:    j .LBB50_8
+; RV32ZBA-NEXT:  .LBB50_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB50_9
+; RV32ZBA-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu a4, a0, a2
+; RV32ZBA-NEXT:    mul a5, a1, a2
+; RV32ZBA-NEXT:    mulhu a6, a0, a3
+; RV32ZBA-NEXT:    add a4, a4, a5
+; RV32ZBA-NEXT:    mul a5, a1, a3
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    mul a6, a0, a3
+; RV32ZBA-NEXT:    j .LBB50_6
+; RV32ZBA-NEXT:  .LBB50_5: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu a4, a2, a0
+; RV32ZBA-NEXT:    mul a5, a3, a0
+; RV32ZBA-NEXT:    mulhu a6, a2, a1
+; RV32ZBA-NEXT:    add a4, a4, a5
+; RV32ZBA-NEXT:    mul a5, a3, a1
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    mul a6, a2, a1
+; RV32ZBA-NEXT:  .LBB50_6: # %overflow.res
+; RV32ZBA-NEXT:    add a6, a4, a6
+; RV32ZBA-NEXT:    sltu a4, a6, a4
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    snez a4, a4
+; RV32ZBA-NEXT:    andi a4, a4, 1
+; RV32ZBA-NEXT:    bnez a4, .LBB50_8
+; RV32ZBA-NEXT:  .LBB50_7: # %overflow.res
 ; RV32ZBA-NEXT:    mv a0, a2
 ; RV32ZBA-NEXT:    mv a1, a3
-; RV32ZBA-NEXT:  .LBB50_2: # %entry
+; RV32ZBA-NEXT:  .LBB50_8: # %overflow.res
 ; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB50_9: # %overflow.no
+; RV32ZBA-NEXT:    j .LBB50_7
 ;
 ; RV64ZBA-LABEL: umulo.select.i64:
 ; RV64ZBA:       # %bb.0: # %entry
@@ -3685,7 +5121,11 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo.select.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    beqz a1, .LBB50_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB50_5
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow
 ; RV32ZICOND-NEXT:    mul a4, a3, a0
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
 ; RV32ZICOND-NEXT:    snez a6, a3
@@ -3702,6 +5142,36 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    snez a6, a6
 ; RV32ZICOND-NEXT:    or a5, a5, a6
 ; RV32ZICOND-NEXT:    or a4, a5, a4
+; RV32ZICOND-NEXT:    j .LBB50_8
+; RV32ZICOND-NEXT:  .LBB50_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB50_7
+; RV32ZICOND-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    mulhu a4, a0, a2
+; RV32ZICOND-NEXT:    mul a5, a1, a2
+; RV32ZICOND-NEXT:    mulhu a6, a0, a3
+; RV32ZICOND-NEXT:    add a4, a4, a5
+; RV32ZICOND-NEXT:    mul a5, a1, a3
+; RV32ZICOND-NEXT:    add a5, a6, a5
+; RV32ZICOND-NEXT:    mul a6, a0, a3
+; RV32ZICOND-NEXT:    j .LBB50_6
+; RV32ZICOND-NEXT:  .LBB50_5: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    mulhu a4, a2, a0
+; RV32ZICOND-NEXT:    mul a5, a3, a0
+; RV32ZICOND-NEXT:    mulhu a6, a2, a1
+; RV32ZICOND-NEXT:    add a4, a4, a5
+; RV32ZICOND-NEXT:    mul a5, a3, a1
+; RV32ZICOND-NEXT:    add a5, a6, a5
+; RV32ZICOND-NEXT:    mul a6, a2, a1
+; RV32ZICOND-NEXT:  .LBB50_6: # %overflow.res
+; RV32ZICOND-NEXT:    add a6, a4, a6
+; RV32ZICOND-NEXT:    sltu a4, a6, a4
+; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    snez a4, a4
+; RV32ZICOND-NEXT:    j .LBB50_8
+; RV32ZICOND-NEXT:  .LBB50_7: # %overflow.no
+; RV32ZICOND-NEXT:    li a4, 0
+; RV32ZICOND-NEXT:  .LBB50_8: # %overflow.res
+; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
@@ -3726,7 +5196,11 @@ entry:
 
 define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: umulo.not.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    beqz a1, .LBB51_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beqz a3, .LBB51_5
+; RV32-NEXT:  # %bb.2: # %overflow
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    mulhu a6, a0, a2
@@ -3745,6 +5219,38 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB51_3: # %overflow.no.lhs
+; RV32-NEXT:    beqz a3, .LBB51_7
+; RV32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu a4, a0, a2
+; RV32-NEXT:    mul a2, a1, a2
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    mulhu a4, a0, a3
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, a4, a1
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    j .LBB51_6
+; RV32-NEXT:  .LBB51_5: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu a4, a2, a0
+; RV32-NEXT:    mul a0, a3, a0
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    mulhu a4, a2, a1
+; RV32-NEXT:    mul a3, a3, a1
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    mul a1, a2, a1
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    add a0, a3, a0
+; RV32-NEXT:  .LBB51_6: # %overflow.no.rhs.only
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB51_7: # %overflow.no
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo.not.i64:
 ; RV64:       # %bb.0: # %entry
@@ -3753,7 +5259,11 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo.not.i64:
-; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    beqz a1, .LBB51_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB51_5
+; RV32ZBA-NEXT:  # %bb.2: # %overflow
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    mulhu a6, a0, a2
@@ -3772,6 +5282,38 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    or a0, a0, a2
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB51_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB51_7
+; RV32ZBA-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu a4, a0, a2
+; RV32ZBA-NEXT:    mul a2, a1, a2
+; RV32ZBA-NEXT:    add a2, a4, a2
+; RV32ZBA-NEXT:    mulhu a4, a0, a3
+; RV32ZBA-NEXT:    mul a1, a1, a3
+; RV32ZBA-NEXT:    add a1, a4, a1
+; RV32ZBA-NEXT:    mul a0, a0, a3
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    sltu a0, a0, a2
+; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    j .LBB51_6
+; RV32ZBA-NEXT:  .LBB51_5: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu a4, a2, a0
+; RV32ZBA-NEXT:    mul a0, a3, a0
+; RV32ZBA-NEXT:    add a0, a4, a0
+; RV32ZBA-NEXT:    mulhu a4, a2, a1
+; RV32ZBA-NEXT:    mul a3, a3, a1
+; RV32ZBA-NEXT:    add a3, a4, a3
+; RV32ZBA-NEXT:    mul a1, a2, a1
+; RV32ZBA-NEXT:    add a1, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a3, a0
+; RV32ZBA-NEXT:  .LBB51_6: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    xori a0, a0, 1
+; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB51_7: # %overflow.no
+; RV32ZBA-NEXT:    li a0, 1
+; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo.not.i64:
 ; RV64ZBA:       # %bb.0: # %entry
@@ -3780,7 +5322,11 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo.not.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    beqz a1, .LBB51_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB51_5
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow
 ; RV32ZICOND-NEXT:    mul a4, a3, a0
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
 ; RV32ZICOND-NEXT:    mulhu a6, a0, a2
@@ -3799,6 +5345,38 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
+; RV32ZICOND-NEXT:  .LBB51_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB51_7
+; RV32ZICOND-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    mulhu a4, a0, a2
+; RV32ZICOND-NEXT:    mul a2, a1, a2
+; RV32ZICOND-NEXT:    add a2, a4, a2
+; RV32ZICOND-NEXT:    mulhu a4, a0, a3
+; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    add a1, a4, a1
+; RV32ZICOND-NEXT:    mul a0, a0, a3
+; RV32ZICOND-NEXT:    add a0, a2, a0
+; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    j .LBB51_6
+; RV32ZICOND-NEXT:  .LBB51_5: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    mulhu a4, a2, a0
+; RV32ZICOND-NEXT:    mul a0, a3, a0
+; RV32ZICOND-NEXT:    add a0, a4, a0
+; RV32ZICOND-NEXT:    mulhu a4, a2, a1
+; RV32ZICOND-NEXT:    mul a3, a3, a1
+; RV32ZICOND-NEXT:    add a3, a4, a3
+; RV32ZICOND-NEXT:    mul a1, a2, a1
+; RV32ZICOND-NEXT:    add a1, a0, a1
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    add a0, a3, a0
+; RV32ZICOND-NEXT:  .LBB51_6: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    xori a0, a0, 1
+; RV32ZICOND-NEXT:    ret
+; RV32ZICOND-NEXT:  .LBB51_7: # %overflow.no
+; RV32ZICOND-NEXT:    li a0, 1
+; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo.not.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
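
In the umulo checks above, when only one operand's high word is zero (the %overflow.no.lhs.only / %overflow.no.rhs.only paths), two 32x32->64 partial products (each a mul/mulhu pair in the checks) already give both the low 64 bits of the product and the overflow flag; the flag is simply "did anything land at bit 64 or above", which is what the final snez tests. A hedged C sketch, with illustrative names and the known-small operand passed first:

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch only: 'small' is the operand whose high 32 bits are zero. */
    bool umul_i64_overflow_one_sided(uint64_t small, uint64_t big,
                                     uint64_t *prod) {
      uint32_t s    = (uint32_t)small;
      uint32_t b_lo = (uint32_t)big;
      uint32_t b_hi = (uint32_t)(big >> 32);

      uint64_t p0  = (uint64_t)s * b_lo;   /* contributes bits 0..63   */
      uint64_t p1  = (uint64_t)s * b_hi;   /* contributes bits 32..95  */
      uint64_t mid = p1 + (p0 >> 32);      /* cannot wrap 64 bits      */

      *prod = (mid << 32) | (uint32_t)p0;  /* low 64 bits of the product */
      return (mid >> 32) != 0;             /* bits >= 64 set: overflow   */
    }

When both high words are zero, the product of two 32-bit values fits in 64 bits by construction, so %overflow.no only has to materialize the no-overflow answer.
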
@@ -4656,7 +6234,13 @@ continue:
 
 define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: smulo.br.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    srai a5, a0, 31
+; RV32-NEXT:    srai a4, a2, 31
+; RV32-NEXT:    beq a1, a5, .LBB61_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beq a3, a4, .LBB61_6
+; RV32-NEXT:  # %bb.2: # %overflow1
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    mulhsu a2, a1, a2
@@ -4684,13 +6268,133 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    xor a0, a0, a4
 ; RV32-NEXT:    xor a4, a5, a4
 ; RV32-NEXT:    or a0, a4, a0
-; RV32-NEXT:    beqz a0, .LBB61_2
-; RV32-NEXT:  # %bb.1: # %overflow
-; RV32-NEXT:    li a0, 0
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB61_2: # %continue
+; RV32-NEXT:    j .LBB61_26
+; RV32-NEXT:  .LBB61_3: # %overflow.no.lhs
+; RV32-NEXT:    beq a3, a4, .LBB61_8
+; RV32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a1, .LBB61_10
+; RV32-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    bgez a1, .LBB61_11
+; RV32-NEXT:    j .LBB61_12
+; RV32-NEXT:  .LBB61_6: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a3, .LBB61_14
+; RV32-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a4, a2
+; RV32-NEXT:    mv a5, a3
+; RV32-NEXT:    bgez a3, .LBB61_15
+; RV32-NEXT:    j .LBB61_16
+; RV32-NEXT:  .LBB61_8: # %overflow.no
+; RV32-NEXT:  .LBB61_9: # %continue
 ; RV32-NEXT:    li a0, 1
 ; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB61_10:
+; RV32-NEXT:    neg a4, a0
+; RV32-NEXT:    snez a5, a0
+; RV32-NEXT:    neg a6, a1
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    bltz a1, .LBB61_12
+; RV32-NEXT:  .LBB61_11: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:  .LBB61_12: # %overflow.no.lhs.only
+; RV32-NEXT:    bltz a3, .LBB61_18
+; RV32-NEXT:  # %bb.13: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a6, a2
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    j .LBB61_19
+; RV32-NEXT:  .LBB61_14:
+; RV32-NEXT:    neg a4, a2
+; RV32-NEXT:    snez a5, a2
+; RV32-NEXT:    neg a6, a3
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    bltz a3, .LBB61_16
+; RV32-NEXT:  .LBB61_15: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a5, a3
+; RV32-NEXT:    mv a4, a2
+; RV32-NEXT:  .LBB61_16: # %overflow.no.rhs.only
+; RV32-NEXT:    bltz a1, .LBB61_22
+; RV32-NEXT:  # %bb.17: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a6, a0
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    j .LBB61_23
+; RV32-NEXT:  .LBB61_18:
+; RV32-NEXT:    neg a6, a2
+; RV32-NEXT:    snez a0, a2
+; RV32-NEXT:    neg a7, a3
+; RV32-NEXT:    sub a0, a7, a0
+; RV32-NEXT:  .LBB61_19: # %overflow.no.lhs.only
+; RV32-NEXT:    slti a1, a1, 0
+; RV32-NEXT:    slti a7, a3, 0
+; RV32-NEXT:    bltz a3, .LBB61_21
+; RV32-NEXT:  # %bb.20: # %overflow.no.lhs.only
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    mv a6, a2
+; RV32-NEXT:  .LBB61_21: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu a2, a4, a6
+; RV32-NEXT:    mul a3, a5, a6
+; RV32-NEXT:    mul a6, a4, a6
+; RV32-NEXT:    mul a5, a5, a0
+; RV32-NEXT:    mulhu t0, a4, a0
+; RV32-NEXT:    mul a0, a4, a0
+; RV32-NEXT:    xor a1, a7, a1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a5, t0, a5
+; RV32-NEXT:    neg a3, a1
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    xor a4, a6, a3
+; RV32-NEXT:    sltu a2, a0, a2
+; RV32-NEXT:    add a4, a4, a1
+; RV32-NEXT:    xor a0, a0, a3
+; RV32-NEXT:    add a2, a5, a2
+; RV32-NEXT:    sltu a1, a4, a1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    j .LBB61_26
+; RV32-NEXT:  .LBB61_22:
+; RV32-NEXT:    neg a6, a0
+; RV32-NEXT:    snez a2, a0
+; RV32-NEXT:    neg a7, a1
+; RV32-NEXT:    sub a2, a7, a2
+; RV32-NEXT:  .LBB61_23: # %overflow.no.rhs.only
+; RV32-NEXT:    slti a3, a3, 0
+; RV32-NEXT:    slti a7, a1, 0
+; RV32-NEXT:    bltz a1, .LBB61_25
+; RV32-NEXT:  # %bb.24: # %overflow.no.rhs.only
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    mv a6, a0
+; RV32-NEXT:  .LBB61_25: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu a0, a4, a6
+; RV32-NEXT:    mul a1, a5, a6
+; RV32-NEXT:    mul a6, a4, a6
+; RV32-NEXT:    mul a5, a5, a2
+; RV32-NEXT:    mulhu t0, a4, a2
+; RV32-NEXT:    mul a2, a4, a2
+; RV32-NEXT:    xor a3, a3, a7
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a5, t0, a5
+; RV32-NEXT:    neg a1, a3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    xor a4, a6, a1
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    add a4, a4, a3
+; RV32-NEXT:    xor a2, a2, a1
+; RV32-NEXT:    add a0, a5, a0
+; RV32-NEXT:    sltu a3, a4, a3
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sltu a2, a2, a3
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:  .LBB61_26: # %overflow.res
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    beqz a0, .LBB61_9
+; RV32-NEXT:  # %bb.27: # %overflow
+; RV32-NEXT:    li a0, 0
+; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo.br.i64:
 ; RV64:       # %bb.0: # %entry
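
The neg/snez/neg/sub runs in the signed .only paths above are a two-word negation: they form |v| from the (lo, hi) register pair so the magnitudes can be multiplied and the sign re-applied afterwards. A minimal C sketch of just that step (the helper name is illustrative):

    #include <stdint.h>

    /* Sketch only: negate a 64-bit value held as two 32-bit words.
     * -(hi:lo) has low word -lo and high word -hi - (lo != 0), the borrow
     * out of the low word; compare neg a4,a0 / snez a5,a0 / neg a6,a1 /
     * sub a5,a6,a5 in the checks above. */
    void neg_i64_pair(uint32_t lo, uint32_t hi,
                      uint32_t *out_lo, uint32_t *out_hi) {
      *out_lo = 0u - lo;
      *out_hi = 0u - hi - (lo != 0u);
    }
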
@@ -4706,7 +6410,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: smulo.br.i64:
-; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    srai a5, a0, 31
+; RV32ZBA-NEXT:    srai a4, a2, 31
+; RV32ZBA-NEXT:    beq a1, a5, .LBB61_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beq a3, a4, .LBB61_6
+; RV32ZBA-NEXT:  # %bb.2: # %overflow1
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    mulhsu a2, a1, a2
@@ -4734,13 +6444,133 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    xor a0, a0, a4
 ; RV32ZBA-NEXT:    xor a4, a5, a4
 ; RV32ZBA-NEXT:    or a0, a4, a0
-; RV32ZBA-NEXT:    beqz a0, .LBB61_2
-; RV32ZBA-NEXT:  # %bb.1: # %overflow
-; RV32ZBA-NEXT:    li a0, 0
-; RV32ZBA-NEXT:    ret
-; RV32ZBA-NEXT:  .LBB61_2: # %continue
+; RV32ZBA-NEXT:    j .LBB61_26
+; RV32ZBA-NEXT:  .LBB61_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beq a3, a4, .LBB61_8
+; RV32ZBA-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB61_10
+; RV32ZBA-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    bgez a1, .LBB61_11
+; RV32ZBA-NEXT:    j .LBB61_12
+; RV32ZBA-NEXT:  .LBB61_6: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB61_14
+; RV32ZBA-NEXT:  # %bb.7: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a4, a2
+; RV32ZBA-NEXT:    mv a5, a3
+; RV32ZBA-NEXT:    bgez a3, .LBB61_15
+; RV32ZBA-NEXT:    j .LBB61_16
+; RV32ZBA-NEXT:  .LBB61_8: # %overflow.no
+; RV32ZBA-NEXT:  .LBB61_9: # %continue
 ; RV32ZBA-NEXT:    li a0, 1
 ; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB61_10:
+; RV32ZBA-NEXT:    neg a4, a0
+; RV32ZBA-NEXT:    snez a5, a0
+; RV32ZBA-NEXT:    neg a6, a1
+; RV32ZBA-NEXT:    sub a5, a6, a5
+; RV32ZBA-NEXT:    bltz a1, .LBB61_12
+; RV32ZBA-NEXT:  .LBB61_11: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:  .LBB61_12: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    bltz a3, .LBB61_18
+; RV32ZBA-NEXT:  # %bb.13: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a6, a2
+; RV32ZBA-NEXT:    mv a0, a3
+; RV32ZBA-NEXT:    j .LBB61_19
+; RV32ZBA-NEXT:  .LBB61_14:
+; RV32ZBA-NEXT:    neg a4, a2
+; RV32ZBA-NEXT:    snez a5, a2
+; RV32ZBA-NEXT:    neg a6, a3
+; RV32ZBA-NEXT:    sub a5, a6, a5
+; RV32ZBA-NEXT:    bltz a3, .LBB61_16
+; RV32ZBA-NEXT:  .LBB61_15: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a5, a3
+; RV32ZBA-NEXT:    mv a4, a2
+; RV32ZBA-NEXT:  .LBB61_16: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    bltz a1, .LBB61_22
+; RV32ZBA-NEXT:  # %bb.17: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a6, a0
+; RV32ZBA-NEXT:    mv a2, a1
+; RV32ZBA-NEXT:    j .LBB61_23
+; RV32ZBA-NEXT:  .LBB61_18:
+; RV32ZBA-NEXT:    neg a6, a2
+; RV32ZBA-NEXT:    snez a0, a2
+; RV32ZBA-NEXT:    neg a7, a3
+; RV32ZBA-NEXT:    sub a0, a7, a0
+; RV32ZBA-NEXT:  .LBB61_19: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    slti a1, a1, 0
+; RV32ZBA-NEXT:    slti a7, a3, 0
+; RV32ZBA-NEXT:    bltz a3, .LBB61_21
+; RV32ZBA-NEXT:  # %bb.20: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mv a0, a3
+; RV32ZBA-NEXT:    mv a6, a2
+; RV32ZBA-NEXT:  .LBB61_21: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu a2, a4, a6
+; RV32ZBA-NEXT:    mul a3, a5, a6
+; RV32ZBA-NEXT:    mul a6, a4, a6
+; RV32ZBA-NEXT:    mul a5, a5, a0
+; RV32ZBA-NEXT:    mulhu t0, a4, a0
+; RV32ZBA-NEXT:    mul a0, a4, a0
+; RV32ZBA-NEXT:    xor a1, a7, a1
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    add a5, t0, a5
+; RV32ZBA-NEXT:    neg a3, a1
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    xor a4, a6, a3
+; RV32ZBA-NEXT:    sltu a2, a0, a2
+; RV32ZBA-NEXT:    add a4, a4, a1
+; RV32ZBA-NEXT:    xor a0, a0, a3
+; RV32ZBA-NEXT:    add a2, a5, a2
+; RV32ZBA-NEXT:    sltu a1, a4, a1
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a0, a1
+; RV32ZBA-NEXT:    xor a2, a2, a3
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    j .LBB61_26
+; RV32ZBA-NEXT:  .LBB61_22:
+; RV32ZBA-NEXT:    neg a6, a0
+; RV32ZBA-NEXT:    snez a2, a0
+; RV32ZBA-NEXT:    neg a7, a1
+; RV32ZBA-NEXT:    sub a2, a7, a2
+; RV32ZBA-NEXT:  .LBB61_23: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    slti a3, a3, 0
+; RV32ZBA-NEXT:    slti a7, a1, 0
+; RV32ZBA-NEXT:    bltz a1, .LBB61_25
+; RV32ZBA-NEXT:  # %bb.24: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mv a2, a1
+; RV32ZBA-NEXT:    mv a6, a0
+; RV32ZBA-NEXT:  .LBB61_25: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu a0, a4, a6
+; RV32ZBA-NEXT:    mul a1, a5, a6
+; RV32ZBA-NEXT:    mul a6, a4, a6
+; RV32ZBA-NEXT:    mul a5, a5, a2
+; RV32ZBA-NEXT:    mulhu t0, a4, a2
+; RV32ZBA-NEXT:    mul a2, a4, a2
+; RV32ZBA-NEXT:    xor a3, a3, a7
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    add a5, t0, a5
+; RV32ZBA-NEXT:    neg a1, a3
+; RV32ZBA-NEXT:    add a2, a0, a2
+; RV32ZBA-NEXT:    xor a4, a6, a1
+; RV32ZBA-NEXT:    sltu a0, a2, a0
+; RV32ZBA-NEXT:    add a4, a4, a3
+; RV32ZBA-NEXT:    xor a2, a2, a1
+; RV32ZBA-NEXT:    add a0, a5, a0
+; RV32ZBA-NEXT:    sltu a3, a4, a3
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    sltu a2, a2, a3
+; RV32ZBA-NEXT:    xor a0, a0, a1
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:  .LBB61_26: # %overflow.res
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    andi a0, a0, 1
+; RV32ZBA-NEXT:    beqz a0, .LBB61_9
+; RV32ZBA-NEXT:  # %bb.27: # %overflow
+; RV32ZBA-NEXT:    li a0, 0
+; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo.br.i64:
 ; RV64ZBA:       # %bb.0: # %entry
@@ -4756,7 +6586,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: smulo.br.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    srai a5, a0, 31
+; RV32ZICOND-NEXT:    srai a4, a2, 31
+; RV32ZICOND-NEXT:    beq a1, a5, .LBB61_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beq a3, a4, .LBB61_5
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow1
 ; RV32ZICOND-NEXT:    mulhu a4, a0, a2
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
 ; RV32ZICOND-NEXT:    mulhsu a2, a1, a2
@@ -4784,11 +6620,123 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    xor a0, a0, a4
 ; RV32ZICOND-NEXT:    xor a4, a5, a4
 ; RV32ZICOND-NEXT:    or a0, a4, a0
-; RV32ZICOND-NEXT:    beqz a0, .LBB61_2
-; RV32ZICOND-NEXT:  # %bb.1: # %overflow
+; RV32ZICOND-NEXT:    j .LBB61_6
+; RV32ZICOND-NEXT:  .LBB61_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beq a3, a4, .LBB61_8
+; RV32ZICOND-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    slti a4, a1, 0
+; RV32ZICOND-NEXT:    neg a5, a0
+; RV32ZICOND-NEXT:    snez a6, a0
+; RV32ZICOND-NEXT:    neg a7, a1
+; RV32ZICOND-NEXT:    snez t0, a2
+; RV32ZICOND-NEXT:    sub a6, a7, a6
+; RV32ZICOND-NEXT:    neg a7, a3
+; RV32ZICOND-NEXT:    sub a7, a7, t0
+; RV32ZICOND-NEXT:    slti t0, a3, 0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
+; RV32ZICOND-NEXT:    or a5, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    or a0, a5, a0
+; RV32ZICOND-NEXT:    neg a5, a2
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, t0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, t0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    or a6, a6, a1
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    xor a4, t0, a4
+; RV32ZICOND-NEXT:    or a5, a5, a2
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    or a7, a7, a3
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    neg t0, a4
+; RV32ZICOND-NEXT:    or a2, a5, a2
+; RV32ZICOND-NEXT:    or a1, a6, a1
+; RV32ZICOND-NEXT:    or a3, a7, a3
+; RV32ZICOND-NEXT:    mulhu a5, a0, a2
+; RV32ZICOND-NEXT:    mul a6, a0, a2
+; RV32ZICOND-NEXT:    mul a2, a1, a2
+; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    mulhu a7, a0, a3
+; RV32ZICOND-NEXT:    mul a0, a0, a3
+; RV32ZICOND-NEXT:    xor a3, a6, t0
+; RV32ZICOND-NEXT:    add a2, a5, a2
+; RV32ZICOND-NEXT:    add a1, a7, a1
+; RV32ZICOND-NEXT:    add a3, a3, a4
+; RV32ZICOND-NEXT:    add a0, a2, a0
+; RV32ZICOND-NEXT:    sltu a3, a3, a4
+; RV32ZICOND-NEXT:    sltu a2, a0, a2
+; RV32ZICOND-NEXT:    xor a0, a0, t0
+; RV32ZICOND-NEXT:    add a1, a1, a2
+; RV32ZICOND-NEXT:    add a0, a0, a3
+; RV32ZICOND-NEXT:    sltu a0, a0, a3
+; RV32ZICOND-NEXT:    xor a1, a1, t0
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    j .LBB61_6
+; RV32ZICOND-NEXT:  .LBB61_5: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    slti a4, a3, 0
+; RV32ZICOND-NEXT:    neg a5, a2
+; RV32ZICOND-NEXT:    snez a6, a2
+; RV32ZICOND-NEXT:    neg a7, a3
+; RV32ZICOND-NEXT:    snez t0, a0
+; RV32ZICOND-NEXT:    sub a6, a7, a6
+; RV32ZICOND-NEXT:    neg a7, a1
+; RV32ZICOND-NEXT:    sub a7, a7, t0
+; RV32ZICOND-NEXT:    slti t0, a1, 0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
+; RV32ZICOND-NEXT:    or a5, a5, a2
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
+; RV32ZICOND-NEXT:    or a2, a5, a2
+; RV32ZICOND-NEXT:    neg a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
+; RV32ZICOND-NEXT:    czero.nez a1, a1, t0
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    or a6, a6, a3
+; RV32ZICOND-NEXT:    czero.eqz a6, a6, a4
+; RV32ZICOND-NEXT:    xor a4, a4, t0
+; RV32ZICOND-NEXT:    or a5, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    or a7, a7, a1
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, t0
+; RV32ZICOND-NEXT:    czero.eqz a7, a7, t0
+; RV32ZICOND-NEXT:    neg t0, a4
+; RV32ZICOND-NEXT:    or a0, a5, a0
+; RV32ZICOND-NEXT:    or a3, a6, a3
+; RV32ZICOND-NEXT:    or a1, a7, a1
+; RV32ZICOND-NEXT:    mulhu a5, a2, a0
+; RV32ZICOND-NEXT:    mul a6, a2, a0
+; RV32ZICOND-NEXT:    mul a0, a3, a0
+; RV32ZICOND-NEXT:    mul a3, a3, a1
+; RV32ZICOND-NEXT:    mulhu a7, a2, a1
+; RV32ZICOND-NEXT:    mul a1, a2, a1
+; RV32ZICOND-NEXT:    xor a2, a6, t0
+; RV32ZICOND-NEXT:    add a0, a5, a0
+; RV32ZICOND-NEXT:    add a3, a7, a3
+; RV32ZICOND-NEXT:    add a2, a2, a4
+; RV32ZICOND-NEXT:    add a1, a0, a1
+; RV32ZICOND-NEXT:    sltu a2, a2, a4
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    xor a1, a1, t0
+; RV32ZICOND-NEXT:    add a0, a3, a0
+; RV32ZICOND-NEXT:    add a1, a1, a2
+; RV32ZICOND-NEXT:    sltu a1, a1, a2
+; RV32ZICOND-NEXT:    xor a0, a0, t0
+; RV32ZICOND-NEXT:    add a0, a0, a1
+; RV32ZICOND-NEXT:  .LBB61_6: # %overflow.res
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    andi a0, a0, 1
+; RV32ZICOND-NEXT:    beqz a0, .LBB61_9
+; RV32ZICOND-NEXT:  # %bb.7: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
 ; RV32ZICOND-NEXT:    ret
-; RV32ZICOND-NEXT:  .LBB61_2: # %continue
+; RV32ZICOND-NEXT:  .LBB61_8: # %overflow.no
+; RV32ZICOND-NEXT:  .LBB61_9: # %continue
 ; RV32ZICOND-NEXT:    li a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -4819,43 +6767,56 @@ continue:
 
 define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32-LABEL: smulo2.br.i64:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    li a2, -13
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    li a4, -1
-; RV32-NEXT:    mulhu a5, a0, a2
-; RV32-NEXT:    mul a6, a1, a2
-; RV32-NEXT:    mulhsu a2, a1, a2
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    sltu a6, a5, a6
-; RV32-NEXT:    sub a5, a5, a0
-; RV32-NEXT:    mulhsu a0, a4, a0
-; RV32-NEXT:    add a2, a2, a6
-; RV32-NEXT:    sltu a3, a5, a3
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    srai a3, a2, 31
-; RV32-NEXT:    srai a6, a0, 31
-; RV32-NEXT:    add a3, a3, a6
-; RV32-NEXT:    neg a6, a1
-; RV32-NEXT:    mulh a4, a1, a4
-; RV32-NEXT:    srai a5, a5, 31
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    srai a2, a0, 31
+; RV32-NEXT:    beq a1, a2, .LBB62_3
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    slti a2, a1, 0
+; RV32-NEXT:    bltz a1, .LBB62_5
+; RV32-NEXT:  # %bb.2: # %overflow.lhs
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    xori a3, a2, 1
+; RV32-NEXT:    bgez a1, .LBB62_6
+; RV32-NEXT:    j .LBB62_7
+; RV32-NEXT:  .LBB62_3: # %overflow.no.lhs
+; RV32-NEXT:  .LBB62_4: # %continue
+; RV32-NEXT:    li a0, 1
+; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB62_5:
+; RV32-NEXT:    neg a4, a0
+; RV32-NEXT:    snez a3, a0
+; RV32-NEXT:    neg a5, a1
+; RV32-NEXT:    sub a5, a5, a3
+; RV32-NEXT:    xori a3, a2, 1
+; RV32-NEXT:    bltz a1, .LBB62_7
+; RV32-NEXT:  .LBB62_6: # %overflow.lhs
+; RV32-NEXT:    mv a5, a1
+; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:  .LBB62_7: # %overflow.lhs
+; RV32-NEXT:    li a0, 13
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    mul a1, a4, a0
+; RV32-NEXT:    mulhu a4, a4, a0
+; RV32-NEXT:    mulhu a6, a5, a0
+; RV32-NEXT:    mul a0, a5, a0
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    sltu a4, a0, a4
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    add a4, a6, a4
+; RV32-NEXT:    sltu a1, a1, a3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    xor a2, a4, a2
 ; RV32-NEXT:    add a0, a2, a0
-; RV32-NEXT:    sltu a2, a0, a2
-; RV32-NEXT:    sub a0, a0, a1
-; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    sltu a1, a0, a6
-; RV32-NEXT:    add a2, a4, a2
-; RV32-NEXT:    add a1, a2, a1
-; RV32-NEXT:    xor a1, a1, a5
-; RV32-NEXT:    xor a0, a0, a5
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    beqz a0, .LBB62_2
-; RV32-NEXT:  # %bb.1: # %overflow
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    beqz a0, .LBB62_4
+; RV32-NEXT:  # %bb.8: # %overflow
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB62_2: # %continue
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo2.br.i64:
 ; RV64:       # %bb.0: # %entry
@@ -4872,43 +6833,58 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: smulo2.br.i64:
-; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    li a2, -13
-; RV32ZBA-NEXT:    neg a3, a0
-; RV32ZBA-NEXT:    li a4, -1
-; RV32ZBA-NEXT:    mulhu a5, a0, a2
-; RV32ZBA-NEXT:    mul a6, a1, a2
-; RV32ZBA-NEXT:    mulhsu a2, a1, a2
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    sltu a6, a5, a6
-; RV32ZBA-NEXT:    sub a5, a5, a0
-; RV32ZBA-NEXT:    mulhsu a0, a4, a0
-; RV32ZBA-NEXT:    add a2, a2, a6
-; RV32ZBA-NEXT:    sltu a3, a5, a3
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    srai a2, a0, 31
+; RV32ZBA-NEXT:    beq a1, a2, .LBB62_3
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    slti a2, a1, 0
+; RV32ZBA-NEXT:    bltz a1, .LBB62_5
+; RV32ZBA-NEXT:  # %bb.2: # %overflow.lhs
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    xori a3, a2, 1
+; RV32ZBA-NEXT:    bgez a1, .LBB62_6
+; RV32ZBA-NEXT:    j .LBB62_7
+; RV32ZBA-NEXT:  .LBB62_3: # %overflow.no.lhs
+; RV32ZBA-NEXT:  .LBB62_4: # %continue
+; RV32ZBA-NEXT:    li a0, 1
+; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB62_5:
+; RV32ZBA-NEXT:    neg a4, a0
+; RV32ZBA-NEXT:    snez a3, a0
+; RV32ZBA-NEXT:    neg a5, a1
+; RV32ZBA-NEXT:    sub a5, a5, a3
+; RV32ZBA-NEXT:    xori a3, a2, 1
+; RV32ZBA-NEXT:    bltz a1, .LBB62_7
+; RV32ZBA-NEXT:  .LBB62_6: # %overflow.lhs
+; RV32ZBA-NEXT:    mv a5, a1
+; RV32ZBA-NEXT:    mv a4, a0
+; RV32ZBA-NEXT:  .LBB62_7: # %overflow.lhs
+; RV32ZBA-NEXT:    sh1add a0, a4, a4
+; RV32ZBA-NEXT:    li a1, 13
+; RV32ZBA-NEXT:    sh1add a6, a5, a5
+; RV32ZBA-NEXT:    addi a2, a2, -1
+; RV32ZBA-NEXT:    sh2add a0, a0, a4
+; RV32ZBA-NEXT:    mulhu a4, a4, a1
+; RV32ZBA-NEXT:    sh2add a6, a6, a5
+; RV32ZBA-NEXT:    mulhu a1, a5, a1
+; RV32ZBA-NEXT:    add a6, a4, a6
+; RV32ZBA-NEXT:    xor a0, a0, a2
+; RV32ZBA-NEXT:    sltu a4, a6, a4
 ; RV32ZBA-NEXT:    add a0, a0, a3
-; RV32ZBA-NEXT:    srai a3, a2, 31
-; RV32ZBA-NEXT:    srai a6, a0, 31
-; RV32ZBA-NEXT:    add a3, a3, a6
-; RV32ZBA-NEXT:    neg a6, a1
-; RV32ZBA-NEXT:    mulh a4, a1, a4
-; RV32ZBA-NEXT:    srai a5, a5, 31
-; RV32ZBA-NEXT:    add a0, a2, a0
-; RV32ZBA-NEXT:    sltu a2, a0, a2
-; RV32ZBA-NEXT:    sub a0, a0, a1
-; RV32ZBA-NEXT:    add a2, a3, a2
-; RV32ZBA-NEXT:    sltu a1, a0, a6
-; RV32ZBA-NEXT:    add a2, a4, a2
-; RV32ZBA-NEXT:    add a1, a2, a1
-; RV32ZBA-NEXT:    xor a1, a1, a5
-; RV32ZBA-NEXT:    xor a0, a0, a5
-; RV32ZBA-NEXT:    or a0, a0, a1
-; RV32ZBA-NEXT:    beqz a0, .LBB62_2
-; RV32ZBA-NEXT:  # %bb.1: # %overflow
+; RV32ZBA-NEXT:    xor a5, a6, a2
+; RV32ZBA-NEXT:    add a1, a1, a4
+; RV32ZBA-NEXT:    sltu a0, a0, a3
+; RV32ZBA-NEXT:    add a5, a5, a0
+; RV32ZBA-NEXT:    sltu a0, a5, a0
+; RV32ZBA-NEXT:    xor a1, a1, a2
+; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    andi a0, a0, 1
+; RV32ZBA-NEXT:    beqz a0, .LBB62_4
+; RV32ZBA-NEXT:  # %bb.8: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
 ; RV32ZBA-NEXT:    ret
-; RV32ZBA-NEXT:  .LBB62_2: # %continue
-; RV32ZBA-NEXT:    li a0, 1
-; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo2.br.i64:
 ; RV64ZBA:       # %bb.0: # %entry
@@ -4925,43 +6901,54 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: smulo2.br.i64:
-; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    li a2, -13
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    srai a2, a0, 31
+; RV32ZICOND-NEXT:    beq a1, a2, .LBB62_3
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    slti a2, a1, 0
 ; RV32ZICOND-NEXT:    neg a3, a0
-; RV32ZICOND-NEXT:    li a4, -1
-; RV32ZICOND-NEXT:    mulhu a5, a0, a2
-; RV32ZICOND-NEXT:    mul a6, a1, a2
-; RV32ZICOND-NEXT:    mulhsu a2, a1, a2
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    sltu a6, a5, a6
-; RV32ZICOND-NEXT:    sub a5, a5, a0
-; RV32ZICOND-NEXT:    mulhsu a0, a4, a0
-; RV32ZICOND-NEXT:    add a2, a2, a6
-; RV32ZICOND-NEXT:    sltu a3, a5, a3
-; RV32ZICOND-NEXT:    add a0, a0, a3
-; RV32ZICOND-NEXT:    srai a3, a2, 31
-; RV32ZICOND-NEXT:    srai a6, a0, 31
-; RV32ZICOND-NEXT:    add a3, a3, a6
-; RV32ZICOND-NEXT:    neg a6, a1
-; RV32ZICOND-NEXT:    mulh a4, a1, a4
-; RV32ZICOND-NEXT:    srai a5, a5, 31
-; RV32ZICOND-NEXT:    add a0, a2, a0
-; RV32ZICOND-NEXT:    sltu a2, a0, a2
-; RV32ZICOND-NEXT:    sub a0, a0, a1
-; RV32ZICOND-NEXT:    add a2, a3, a2
-; RV32ZICOND-NEXT:    sltu a1, a0, a6
-; RV32ZICOND-NEXT:    add a2, a4, a2
-; RV32ZICOND-NEXT:    add a1, a2, a1
-; RV32ZICOND-NEXT:    xor a1, a1, a5
-; RV32ZICOND-NEXT:    xor a0, a0, a5
-; RV32ZICOND-NEXT:    or a0, a0, a1
-; RV32ZICOND-NEXT:    beqz a0, .LBB62_2
-; RV32ZICOND-NEXT:  # %bb.1: # %overflow
-; RV32ZICOND-NEXT:    li a0, 0
-; RV32ZICOND-NEXT:    ret
+; RV32ZICOND-NEXT:    snez a4, a0
+; RV32ZICOND-NEXT:    neg a5, a1
+; RV32ZICOND-NEXT:    li a6, 13
+; RV32ZICOND-NEXT:    sub a5, a5, a4
+; RV32ZICOND-NEXT:    xori a4, a2, 1
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV32ZICOND-NEXT:    or a3, a3, a0
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a2
+; RV32ZICOND-NEXT:    or a5, a5, a1
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a2
+; RV32ZICOND-NEXT:    addi a2, a2, -1
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    or a1, a5, a1
+; RV32ZICOND-NEXT:    mul a3, a0, a6
+; RV32ZICOND-NEXT:    mulhu a0, a0, a6
+; RV32ZICOND-NEXT:    mulhu a5, a1, a6
+; RV32ZICOND-NEXT:    mul a1, a1, a6
+; RV32ZICOND-NEXT:    xor a3, a3, a2
+; RV32ZICOND-NEXT:    add a1, a0, a1
+; RV32ZICOND-NEXT:    add a3, a3, a4
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    sltu a3, a3, a4
+; RV32ZICOND-NEXT:    xor a1, a1, a2
+; RV32ZICOND-NEXT:    add a0, a5, a0
+; RV32ZICOND-NEXT:    add a1, a1, a3
+; RV32ZICOND-NEXT:    sltu a1, a1, a3
+; RV32ZICOND-NEXT:    xor a0, a0, a2
+; RV32ZICOND-NEXT:    add a0, a0, a1
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    andi a0, a0, 1
+; RV32ZICOND-NEXT:    bnez a0, .LBB62_4
 ; RV32ZICOND-NEXT:  .LBB62_2: # %continue
 ; RV32ZICOND-NEXT:    li a0, 1
 ; RV32ZICOND-NEXT:    ret
+; RV32ZICOND-NEXT:  .LBB62_3: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    j .LBB62_2
+; RV32ZICOND-NEXT:  .LBB62_4: # %overflow
+; RV32ZICOND-NEXT:    li a0, 0
+; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo2.br.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
@@ -5079,7 +7066,11 @@ continue:
 
 define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: umulo.br.i64:
-; RV32:       # %bb.0: # %entry
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    beqz a1, .LBB64_4
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    beqz a3, .LBB64_6
+; RV32-NEXT:  # %bb.2: # %overflow1
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
 ; RV32-NEXT:    mulhu a6, a0, a2
@@ -5096,13 +7087,45 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    sltu a2, a4, a6
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    or a0, a0, a2
-; RV32-NEXT:    beqz a0, .LBB64_2
-; RV32-NEXT:  # %bb.1: # %overflow
-; RV32-NEXT:    li a0, 0
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB64_2: # %continue
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    bnez a0, .LBB64_8
+; RV32-NEXT:  .LBB64_3: # %continue
 ; RV32-NEXT:    li a0, 1
 ; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB64_4: # %overflow.no.lhs
+; RV32-NEXT:    beqz a3, .LBB64_9
+; RV32-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32-NEXT:    mulhu a4, a0, a2
+; RV32-NEXT:    mul a2, a1, a2
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    mulhu a4, a0, a3
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, a4, a1
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    j .LBB64_7
+; RV32-NEXT:  .LBB64_6: # %overflow.no.rhs.only
+; RV32-NEXT:    mulhu a4, a2, a0
+; RV32-NEXT:    mul a0, a3, a0
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    mulhu a4, a2, a1
+; RV32-NEXT:    mul a3, a3, a1
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    mul a1, a2, a1
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    add a0, a3, a0
+; RV32-NEXT:  .LBB64_7: # %overflow.no.rhs.only
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    beqz a0, .LBB64_3
+; RV32-NEXT:  .LBB64_8: # %overflow
+; RV32-NEXT:    li a0, 0
+; RV32-NEXT:    ret
+; RV32-NEXT:  .LBB64_9: # %overflow.no
+; RV32-NEXT:    j .LBB64_3
 ;
 ; RV64-LABEL: umulo.br.i64:
 ; RV64:       # %bb.0: # %entry
@@ -5116,7 +7139,11 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo.br.i64:
-; RV32ZBA:       # %bb.0: # %entry
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    beqz a1, .LBB64_4
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB64_6
+; RV32ZBA-NEXT:  # %bb.2: # %overflow1
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
 ; RV32ZBA-NEXT:    mulhu a6, a0, a2
@@ -5133,13 +7160,45 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    sltu a2, a4, a6
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    or a0, a0, a2
-; RV32ZBA-NEXT:    beqz a0, .LBB64_2
-; RV32ZBA-NEXT:  # %bb.1: # %overflow
-; RV32ZBA-NEXT:    li a0, 0
-; RV32ZBA-NEXT:    ret
-; RV32ZBA-NEXT:  .LBB64_2: # %continue
+; RV32ZBA-NEXT:    andi a0, a0, 1
+; RV32ZBA-NEXT:    bnez a0, .LBB64_8
+; RV32ZBA-NEXT:  .LBB64_3: # %continue
 ; RV32ZBA-NEXT:    li a0, 1
 ; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB64_4: # %overflow.no.lhs
+; RV32ZBA-NEXT:    beqz a3, .LBB64_9
+; RV32ZBA-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32ZBA-NEXT:    mulhu a4, a0, a2
+; RV32ZBA-NEXT:    mul a2, a1, a2
+; RV32ZBA-NEXT:    add a2, a4, a2
+; RV32ZBA-NEXT:    mulhu a4, a0, a3
+; RV32ZBA-NEXT:    mul a1, a1, a3
+; RV32ZBA-NEXT:    add a1, a4, a1
+; RV32ZBA-NEXT:    mul a0, a0, a3
+; RV32ZBA-NEXT:    add a0, a2, a0
+; RV32ZBA-NEXT:    sltu a0, a0, a2
+; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    j .LBB64_7
+; RV32ZBA-NEXT:  .LBB64_6: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    mulhu a4, a2, a0
+; RV32ZBA-NEXT:    mul a0, a3, a0
+; RV32ZBA-NEXT:    add a0, a4, a0
+; RV32ZBA-NEXT:    mulhu a4, a2, a1
+; RV32ZBA-NEXT:    mul a3, a3, a1
+; RV32ZBA-NEXT:    add a3, a4, a3
+; RV32ZBA-NEXT:    mul a1, a2, a1
+; RV32ZBA-NEXT:    add a1, a0, a1
+; RV32ZBA-NEXT:    sltu a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a3, a0
+; RV32ZBA-NEXT:  .LBB64_7: # %overflow.no.rhs.only
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    andi a0, a0, 1
+; RV32ZBA-NEXT:    beqz a0, .LBB64_3
+; RV32ZBA-NEXT:  .LBB64_8: # %overflow
+; RV32ZBA-NEXT:    li a0, 0
+; RV32ZBA-NEXT:    ret
+; RV32ZBA-NEXT:  .LBB64_9: # %overflow.no
+; RV32ZBA-NEXT:    j .LBB64_3
 ;
 ; RV64ZBA-LABEL: umulo.br.i64:
 ; RV64ZBA:       # %bb.0: # %entry
@@ -5153,7 +7212,11 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo.br.i64:
-; RV32ZICOND:       # %bb.0: # %entry
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    beqz a1, .LBB64_4
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB64_6
+; RV32ZICOND-NEXT:  # %bb.2: # %overflow1
 ; RV32ZICOND-NEXT:    mul a4, a3, a0
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
 ; RV32ZICOND-NEXT:    mulhu a6, a0, a2
@@ -5170,13 +7233,45 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    sltu a2, a4, a6
 ; RV32ZICOND-NEXT:    or a0, a1, a0
 ; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    beqz a0, .LBB64_2
-; RV32ZICOND-NEXT:  # %bb.1: # %overflow
-; RV32ZICOND-NEXT:    li a0, 0
-; RV32ZICOND-NEXT:    ret
-; RV32ZICOND-NEXT:  .LBB64_2: # %continue
+; RV32ZICOND-NEXT:    andi a0, a0, 1
+; RV32ZICOND-NEXT:    bnez a0, .LBB64_8
+; RV32ZICOND-NEXT:  .LBB64_3: # %continue
 ; RV32ZICOND-NEXT:    li a0, 1
 ; RV32ZICOND-NEXT:    ret
+; RV32ZICOND-NEXT:  .LBB64_4: # %overflow.no.lhs
+; RV32ZICOND-NEXT:    beqz a3, .LBB64_9
+; RV32ZICOND-NEXT:  # %bb.5: # %overflow.no.lhs.only
+; RV32ZICOND-NEXT:    mulhu a4, a0, a2
+; RV32ZICOND-NEXT:    mul a2, a1, a2
+; RV32ZICOND-NEXT:    add a2, a4, a2
+; RV32ZICOND-NEXT:    mulhu a4, a0, a3
+; RV32ZICOND-NEXT:    mul a1, a1, a3
+; RV32ZICOND-NEXT:    add a1, a4, a1
+; RV32ZICOND-NEXT:    mul a0, a0, a3
+; RV32ZICOND-NEXT:    add a0, a2, a0
+; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    j .LBB64_7
+; RV32ZICOND-NEXT:  .LBB64_6: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    mulhu a4, a2, a0
+; RV32ZICOND-NEXT:    mul a0, a3, a0
+; RV32ZICOND-NEXT:    add a0, a4, a0
+; RV32ZICOND-NEXT:    mulhu a4, a2, a1
+; RV32ZICOND-NEXT:    mul a3, a3, a1
+; RV32ZICOND-NEXT:    add a3, a4, a3
+; RV32ZICOND-NEXT:    mul a1, a2, a1
+; RV32ZICOND-NEXT:    add a1, a0, a1
+; RV32ZICOND-NEXT:    sltu a0, a1, a0
+; RV32ZICOND-NEXT:    add a0, a3, a0
+; RV32ZICOND-NEXT:  .LBB64_7: # %overflow.no.rhs.only
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    andi a0, a0, 1
+; RV32ZICOND-NEXT:    beqz a0, .LBB64_3
+; RV32ZICOND-NEXT:  .LBB64_8: # %overflow
+; RV32ZICOND-NEXT:    li a0, 0
+; RV32ZICOND-NEXT:    ret
+; RV32ZICOND-NEXT:  .LBB64_9: # %overflow.no
+; RV32ZICOND-NEXT:    j .LBB64_3
 ;
 ; RV64ZICOND-LABEL: umulo.br.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
@@ -5203,16 +7298,13 @@ continue:
 
 define zeroext i1 @umulo2.br.i64(i64 %v1) {
 ; RV32-LABEL: umulo2.br.i64:
-; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    add a2, a0, a0
-; RV32-NEXT:    sltu a0, a2, a0
-; RV32-NEXT:    add a2, a1, a1
-; RV32-NEXT:    add a2, a2, a0
-; RV32-NEXT:    beq a2, a1, .LBB65_2
-; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    sltu a0, a2, a1
-; RV32-NEXT:  .LBB65_2: # %entry
-; RV32-NEXT:    beqz a0, .LBB65_4
+; RV32:       # %bb.0: # %overflow.entry
+; RV32-NEXT:    beqz a1, .LBB65_2
+; RV32-NEXT:  # %bb.1: # %overflow.lhs
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:  .LBB65_2: # %overflow.res
+; RV32-NEXT:    andi a1, a1, 1
+; RV32-NEXT:    beqz a1, .LBB65_4
 ; RV32-NEXT:  # %bb.3: # %overflow
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    ret
@@ -5232,16 +7324,13 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) {
 ; RV64-NEXT:    ret
 ;
 ; RV32ZBA-LABEL: umulo2.br.i64:
-; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    add a2, a0, a0
-; RV32ZBA-NEXT:    sltu a0, a2, a0
-; RV32ZBA-NEXT:    add a2, a1, a1
-; RV32ZBA-NEXT:    add a2, a2, a0
-; RV32ZBA-NEXT:    beq a2, a1, .LBB65_2
-; RV32ZBA-NEXT:  # %bb.1: # %entry
-; RV32ZBA-NEXT:    sltu a0, a2, a1
-; RV32ZBA-NEXT:  .LBB65_2: # %entry
-; RV32ZBA-NEXT:    beqz a0, .LBB65_4
+; RV32ZBA:       # %bb.0: # %overflow.entry
+; RV32ZBA-NEXT:    beqz a1, .LBB65_2
+; RV32ZBA-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZBA-NEXT:    srli a1, a1, 31
+; RV32ZBA-NEXT:  .LBB65_2: # %overflow.res
+; RV32ZBA-NEXT:    andi a1, a1, 1
+; RV32ZBA-NEXT:    beqz a1, .LBB65_4
 ; RV32ZBA-NEXT:  # %bb.3: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
 ; RV32ZBA-NEXT:    ret
@@ -5261,21 +7350,17 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) {
 ; RV64ZBA-NEXT:    ret
 ;
 ; RV32ZICOND-LABEL: umulo2.br.i64:
-; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    add a2, a0, a0
-; RV32ZICOND-NEXT:    add a3, a1, a1
-; RV32ZICOND-NEXT:    sltu a0, a2, a0
-; RV32ZICOND-NEXT:    add a3, a3, a0
-; RV32ZICOND-NEXT:    xor a2, a3, a1
-; RV32ZICOND-NEXT:    sltu a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
-; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
-; RV32ZICOND-NEXT:    or a0, a0, a1
-; RV32ZICOND-NEXT:    beqz a0, .LBB65_2
-; RV32ZICOND-NEXT:  # %bb.1: # %overflow
+; RV32ZICOND:       # %bb.0: # %overflow.entry
+; RV32ZICOND-NEXT:    beqz a1, .LBB65_2
+; RV32ZICOND-NEXT:  # %bb.1: # %overflow.lhs
+; RV32ZICOND-NEXT:    srli a1, a1, 31
+; RV32ZICOND-NEXT:  .LBB65_2: # %overflow.res
+; RV32ZICOND-NEXT:    andi a1, a1, 1
+; RV32ZICOND-NEXT:    beqz a1, .LBB65_4
+; RV32ZICOND-NEXT:  # %bb.3: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
 ; RV32ZICOND-NEXT:    ret
-; RV32ZICOND-NEXT:  .LBB65_2: # %continue
+; RV32ZICOND-NEXT:  .LBB65_4: # %continue
 ; RV32ZICOND-NEXT:    li a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
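
For readers skimming the updated RV32/RV32ZBA/RV32ZICOND checks above, the new basic-block names (%overflow.entry, %overflow.lhs, %overflow.no.lhs, %overflow.no.lhs.only, %overflow.no.rhs.only, %overflow.no, %overflow.res) come straight from the block comments in the generated assembly. The following is a hand-written LLVM IR sketch of that control-flow skeleton for a 64-bit umul.with.overflow on a target whose legal multiply is 32 bits; it is illustrative only, not the pass's verbatim output, and the arithmetic in each leaf is just one correct way to form the product and overflow bit for that case.

define { i64, i1 } @umulo_i64_sketch(i64 %a, i64 %b) {
overflow.entry:
  %a.hi = lshr i64 %a, 32
  %a.fits = icmp eq i64 %a.hi, 0
  br i1 %a.fits, label %overflow.no.lhs, label %overflow.lhs

overflow.lhs:                         ; %a has a non-zero high half; classify %b
  %b.hi0 = lshr i64 %b, 32
  %b.fits0 = icmp eq i64 %b.hi0, 0
  br i1 %b.fits0, label %overflow.no.rhs.only, label %overflow

overflow.no.lhs:                      ; %a fits in 32 bits; classify %b
  %b.hi1 = lshr i64 %b, 32
  %b.fits1 = icmp eq i64 %b.hi1, 0
  br i1 %b.fits1, label %overflow.no, label %overflow.no.lhs.only

overflow:                             ; both operands wide: generic expansion
  %a.w = zext i64 %a to i128
  %b.w = zext i64 %b to i128
  %p.w = mul i128 %a.w, %b.w
  %p0 = trunc i128 %p.w to i64
  %hi.w = lshr i128 %p.w, 64
  %ov0 = icmp ne i128 %hi.w, 0
  br label %overflow.res

overflow.no.lhs.only:                 ; only %b has a non-zero high half
  %b.lo = and i64 %b, 4294967295
  %b.hi = lshr i64 %b, 32
  %cross = mul i64 %a, %b.hi          ; exact: both operands are < 2^32
  %low = mul i64 %a, %b.lo            ; exact: both operands are < 2^32
  %cross.top = lshr i64 %cross, 32
  %cross.shl = shl i64 %cross, 32
  %p1 = add i64 %cross.shl, %low
  %carry = icmp ult i64 %p1, %cross.shl
  %spill = icmp ne i64 %cross.top, 0
  %ov1 = or i1 %spill, %carry
  br label %overflow.res

overflow.no.rhs.only:                 ; symmetric: only %a has a non-zero high half
  %a.lo = and i64 %a, 4294967295
  %a.top = lshr i64 %a, 32
  %cross2 = mul i64 %b, %a.top
  %low2 = mul i64 %b, %a.lo
  %cross2.top = lshr i64 %cross2, 32
  %cross2.shl = shl i64 %cross2, 32
  %p2 = add i64 %cross2.shl, %low2
  %carry2 = icmp ult i64 %p2, %cross2.shl
  %spill2 = icmp ne i64 %cross2.top, 0
  %ov2 = or i1 %spill2, %carry2
  br label %overflow.res

overflow.no:                          ; both fit in 32 bits: product cannot overflow
  %p3 = mul i64 %a, %b
  br label %overflow.res

overflow.res:
  %p = phi i64 [ %p0, %overflow ], [ %p1, %overflow.no.lhs.only ], [ %p2, %overflow.no.rhs.only ], [ %p3, %overflow.no ]
  %ov = phi i1 [ %ov0, %overflow ], [ %ov1, %overflow.no.lhs.only ], [ %ov2, %overflow.no.rhs.only ], [ false, %overflow.no ]
  %r0 = insertvalue { i64, i1 } poison, i64 %p, 0
  %r = insertvalue { i64, i1 } %r0, i1 %ov, 1
  ret { i64, i1 } %r
}

Compared with the generic expansion in %overflow, the single-wide-operand paths drop the hi*hi partial product and one carry chain, which is why the %overflow.no.lhs.only / %overflow.no.rhs.only sequences in the RV32 output above are noticeably shorter than the %overflow1 block, and %overflow.no reduces to a plain multiply with the overflow flag known to be zero.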
diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
index 1e5ab7922de08..ff846adf7e138 100644
--- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
@@ -5,93 +5,106 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC-LABEL: muloti_test:
-; SPARC:       ! %bb.0: ! %start
+; SPARC:       ! %bb.0: ! %overflow.entry
 ; SPARC-NEXT:    save %sp, -96, %sp
-; SPARC-NEXT:    ld [%fp+96], %l2
-; SPARC-NEXT:    mov %i3, %g2
-; SPARC-NEXT:    mov %i2, %g3
-; SPARC-NEXT:    umul %i1, %l2, %l0
-; SPARC-NEXT:    rd %y, %i2
-; SPARC-NEXT:    ld [%fp+92], %l1
-; SPARC-NEXT:    umul %i0, %l2, %i3
-; SPARC-NEXT:    rd %y, %g4
-; SPARC-NEXT:    addcc %i3, %i2, %i2
-; SPARC-NEXT:    addxcc %g4, 0, %i3
-; SPARC-NEXT:    umul %i1, %l1, %g4
-; SPARC-NEXT:    rd %y, %l3
-; SPARC-NEXT:    addcc %g4, %i2, %l4
-; SPARC-NEXT:    addxcc %l3, 0, %i2
-; SPARC-NEXT:    addcc %i3, %i2, %i2
-; SPARC-NEXT:    addxcc %g0, 0, %i3
-; SPARC-NEXT:    umul %i0, %l1, %g4
+; SPARC-NEXT:    ld [%fp+96], %g3
+; SPARC-NEXT:    ld [%fp+92], %l0
+; SPARC-NEXT:    sra %i2, 31, %g2
+; SPARC-NEXT:    xor %i0, %g2, %g4
+; SPARC-NEXT:    xor %i1, %g2, %g2
+; SPARC-NEXT:    or %g2, %g4, %g2
+; SPARC-NEXT:    cmp %g2, 0
+; SPARC-NEXT:    sra %l0, 31, %g2
+; SPARC-NEXT:    xor %i4, %g2, %g4
+; SPARC-NEXT:    xor %i5, %g2, %g2
+; SPARC-NEXT:    be .LBB0_4
+; SPARC-NEXT:    or %g2, %g4, %g2
+; SPARC-NEXT:  ! %bb.1: ! %overflow.lhs
+; SPARC-NEXT:    cmp %g2, 0
+; SPARC-NEXT:    be .LBB0_15
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.2: ! %overflow
+; SPARC-NEXT:    umul %i1, %g3, %l1
+; SPARC-NEXT:    rd %y, %g2
+; SPARC-NEXT:    umul %i0, %g3, %g4
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    addcc %g4, %g2, %g2
+; SPARC-NEXT:    addxcc %l2, 0, %g4
+; SPARC-NEXT:    umul %i1, %l0, %l2
 ; SPARC-NEXT:    rd %y, %l3
-; SPARC-NEXT:    addcc %g4, %i2, %i2
+; SPARC-NEXT:    addcc %l2, %g2, %l2
+; SPARC-NEXT:    addxcc %l3, 0, %g2
+; SPARC-NEXT:    addcc %g4, %g2, %g2
+; SPARC-NEXT:    addxcc %g0, 0, %l3
+; SPARC-NEXT:    umul %i0, %l0, %g4
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %g4, %g2, %g2
 ; SPARC-NEXT:    sra %i0, 31, %g4
-; SPARC-NEXT:    smul %l1, %g4, %l5
-; SPARC-NEXT:    umul %l2, %g4, %l6
+; SPARC-NEXT:    smul %l0, %g4, %l5
+; SPARC-NEXT:    umul %g3, %g4, %l6
 ; SPARC-NEXT:    rd %y, %l7
-; SPARC-NEXT:    addxcc %l3, %i3, %l3
-; SPARC-NEXT:    add %l7, %l6, %i3
-; SPARC-NEXT:    add %i3, %l5, %l5
-; SPARC-NEXT:    addcc %i2, %l6, %l6
-; SPARC-NEXT:    umul %g2, %l2, %i3
-; SPARC-NEXT:    rd %y, %i2
-; SPARC-NEXT:    addxcc %l3, %l5, %l3
-; SPARC-NEXT:    umul %g3, %l2, %l2
-; SPARC-NEXT:    rd %y, %l5
-; SPARC-NEXT:    addcc %l2, %i2, %i2
-; SPARC-NEXT:    addxcc %l5, 0, %l2
-; SPARC-NEXT:    umul %g2, %l1, %l5
+; SPARC-NEXT:    addxcc %l4, %l3, %l3
+; SPARC-NEXT:    add %l7, %l6, %l4
+; SPARC-NEXT:    add %l4, %l5, %l4
+; SPARC-NEXT:    addcc %g2, %l6, %l5
+; SPARC-NEXT:    umul %i3, %g3, %g2
+; SPARC-NEXT:    rd %y, %l6
+; SPARC-NEXT:    addxcc %l3, %l4, %l3
+; SPARC-NEXT:    umul %i2, %g3, %g3
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %g3, %l6, %g3
+; SPARC-NEXT:    addxcc %l4, 0, %l4
+; SPARC-NEXT:    umul %i3, %l0, %l6
 ; SPARC-NEXT:    rd %y, %l7
-; SPARC-NEXT:    addcc %l5, %i2, %i2
-; SPARC-NEXT:    addxcc %l7, 0, %l5
-; SPARC-NEXT:    addcc %l2, %l5, %l2
-; SPARC-NEXT:    addxcc %g0, 0, %l5
-; SPARC-NEXT:    umul %g3, %l1, %l1
+; SPARC-NEXT:    addcc %l6, %g3, %g3
+; SPARC-NEXT:    addxcc %l7, 0, %l6
+; SPARC-NEXT:    addcc %l4, %l6, %l4
+; SPARC-NEXT:    addxcc %g0, 0, %l6
+; SPARC-NEXT:    umul %i2, %l0, %l0
 ; SPARC-NEXT:    rd %y, %l7
-; SPARC-NEXT:    addcc %l1, %l2, %l1
-; SPARC-NEXT:    addxcc %l7, %l5, %l2
-; SPARC-NEXT:    addcc %l0, %l1, %l0
-; SPARC-NEXT:    addxcc %l4, %l2, %l1
-; SPARC-NEXT:    addxcc %l6, 0, %l2
-; SPARC-NEXT:    addxcc %l3, 0, %l3
-; SPARC-NEXT:    umul %g2, %i5, %l4
+; SPARC-NEXT:    addcc %l0, %l4, %l0
+; SPARC-NEXT:    addxcc %l7, %l6, %l4
+; SPARC-NEXT:    addcc %l1, %l0, %l0
+; SPARC-NEXT:    addxcc %l2, %l4, %l1
+; SPARC-NEXT:    addxcc %l5, 0, %l2
+; SPARC-NEXT:    umul %i2, %i5, %l4
 ; SPARC-NEXT:    rd %y, %l5
-; SPARC-NEXT:    sra %l3, 31, %l6
-; SPARC-NEXT:    umul %g3, %i5, %l7
-; SPARC-NEXT:    rd %y, %o0
-; SPARC-NEXT:    addcc %l7, %l5, %l5
-; SPARC-NEXT:    addxcc %o0, 0, %l7
-; SPARC-NEXT:    umul %g2, %i4, %o0
+; SPARC-NEXT:    addxcc %l3, 0, %l3
+; SPARC-NEXT:    umul %i3, %i5, %l6
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    sra %l3, 31, %o0
+; SPARC-NEXT:    addcc %l4, %l7, %l4
+; SPARC-NEXT:    addxcc %l5, 0, %l5
+; SPARC-NEXT:    umul %i3, %i4, %l7
 ; SPARC-NEXT:    rd %y, %o1
-; SPARC-NEXT:    addcc %o0, %l5, %l5
-; SPARC-NEXT:    addxcc %o1, 0, %o0
-; SPARC-NEXT:    addcc %l7, %o0, %l7
-; SPARC-NEXT:    addxcc %g0, 0, %o0
-; SPARC-NEXT:    umul %g3, %i4, %o1
+; SPARC-NEXT:    addcc %l7, %l4, %l4
+; SPARC-NEXT:    addxcc %o1, 0, %l7
+; SPARC-NEXT:    addcc %l5, %l7, %l5
+; SPARC-NEXT:    addxcc %g0, 0, %l7
+; SPARC-NEXT:    umul %i2, %i4, %o1
 ; SPARC-NEXT:    rd %y, %o2
-; SPARC-NEXT:    addcc %o1, %l7, %l7
+; SPARC-NEXT:    addcc %o1, %l5, %l5
 ; SPARC-NEXT:    sra %i4, 31, %o1
-; SPARC-NEXT:    smul %o1, %g3, %g3
-; SPARC-NEXT:    umul %o1, %g2, %g2
+; SPARC-NEXT:    smul %o1, %i2, %i2
+; SPARC-NEXT:    umul %o1, %i3, %i3
 ; SPARC-NEXT:    rd %y, %o3
-; SPARC-NEXT:    addxcc %o2, %o0, %o0
-; SPARC-NEXT:    add %o3, %g3, %g3
-; SPARC-NEXT:    add %g3, %g2, %g3
-; SPARC-NEXT:    addcc %l7, %g2, %l7
-; SPARC-NEXT:    addxcc %o0, %g3, %o0
-; SPARC-NEXT:    addcc %l4, %l0, %g2
-; SPARC-NEXT:    addxcc %l5, %l1, %g3
-; SPARC-NEXT:    addxcc %l7, 0, %l0
-; SPARC-NEXT:    addxcc %o0, 0, %l1
+; SPARC-NEXT:    addxcc %o2, %l7, %l7
+; SPARC-NEXT:    add %o3, %i2, %i2
+; SPARC-NEXT:    add %i2, %i3, %i2
+; SPARC-NEXT:    addcc %l5, %i3, %i3
+; SPARC-NEXT:    addxcc %l7, %i2, %l5
+; SPARC-NEXT:    addcc %l6, %l0, %i2
+; SPARC-NEXT:    addxcc %l4, %l1, %l0
+; SPARC-NEXT:    addxcc %i3, 0, %i3
+; SPARC-NEXT:    addxcc %l5, 0, %l1
 ; SPARC-NEXT:    sra %l1, 31, %l4
-; SPARC-NEXT:    addcc %l2, %l0, %l0
+; SPARC-NEXT:    addcc %l2, %i3, %i3
 ; SPARC-NEXT:    addxcc %l3, %l1, %l1
-; SPARC-NEXT:    addxcc %l6, %l4, %l2
+; SPARC-NEXT:    addxcc %o0, %l4, %l2
 ; SPARC-NEXT:    smul %i4, %g4, %l3
 ; SPARC-NEXT:    umul %i5, %g4, %g4
 ; SPARC-NEXT:    rd %y, %l5
-; SPARC-NEXT:    addxcc %l6, %l4, %l4
+; SPARC-NEXT:    addxcc %o0, %l4, %l4
 ; SPARC-NEXT:    add %l5, %g4, %l5
 ; SPARC-NEXT:    smul %o1, %i0, %l6
 ; SPARC-NEXT:    umul %o1, %i1, %l7
@@ -113,150 +126,1050 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC-NEXT:    addxcc %l7, 0, %i5
 ; SPARC-NEXT:    addcc %l5, %i5, %i5
 ; SPARC-NEXT:    addxcc %g0, 0, %l5
-; SPARC-NEXT:    umul %i0, %i4, %i0
-; SPARC-NEXT:    rd %y, %i4
-; SPARC-NEXT:    addcc %i0, %i5, %i0
-; SPARC-NEXT:    addxcc %i4, %l5, %i4
-; SPARC-NEXT:    addcc %i0, %g4, %i0
-; SPARC-NEXT:    addxcc %i4, %l3, %i4
-; SPARC-NEXT:    addcc %l6, %l0, %i5
+; SPARC-NEXT:    umul %i0, %i4, %i4
+; SPARC-NEXT:    mov %l0, %i0
+; SPARC-NEXT:    rd %y, %l0
+; SPARC-NEXT:    addcc %i4, %i5, %i4
+; SPARC-NEXT:    addxcc %l0, %l5, %i5
+; SPARC-NEXT:    addcc %i4, %g4, %i4
+; SPARC-NEXT:    addxcc %i5, %l3, %i5
+; SPARC-NEXT:    addcc %l6, %i3, %i3
 ; SPARC-NEXT:    addxcc %i1, %l1, %i1
-; SPARC-NEXT:    addxcc %i0, %l2, %i0
-; SPARC-NEXT:    addxcc %i4, %l4, %i4
-; SPARC-NEXT:    sra %g3, 31, %g4
-; SPARC-NEXT:    xor %i4, %g4, %i4
+; SPARC-NEXT:    addxcc %i4, %l2, %i4
+; SPARC-NEXT:    addxcc %i5, %l4, %i5
+; SPARC-NEXT:    sra %i0, 31, %g4
+; SPARC-NEXT:    xor %i5, %g4, %i5
 ; SPARC-NEXT:    xor %i1, %g4, %i1
-; SPARC-NEXT:    or %i1, %i4, %i1
-; SPARC-NEXT:    xor %i0, %g4, %i0
-; SPARC-NEXT:    xor %i5, %g4, %i4
-; SPARC-NEXT:    or %i4, %i0, %i0
-; SPARC-NEXT:    or %i0, %i1, %i0
+; SPARC-NEXT:    or %i1, %i5, %i1
+; SPARC-NEXT:    xor %i4, %g4, %i4
+; SPARC-NEXT:    xor %i3, %g4, %i3
+; SPARC-NEXT:    or %i3, %i4, %i3
+; SPARC-NEXT:    or %i3, %i1, %i1
+; SPARC-NEXT:    cmp %i1, 0
+; SPARC-NEXT:    bne .LBB0_110
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.3: ! %overflow
+; SPARC-NEXT:    ba .LBB0_111
+; SPARC-NEXT:    mov %g0, %g4
+; SPARC-NEXT:  .LBB0_4: ! %overflow.no.lhs
+; SPARC-NEXT:    cmp %g2, 0
+; SPARC-NEXT:    be .LBB0_25
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.5: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov 1, %g4
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_7
+; SPARC-NEXT:    mov %g4, %g2
+; SPARC-NEXT:  ! %bb.6: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %g2
+; SPARC-NEXT:  .LBB0_7: ! %overflow.no.lhs.only
+; SPARC-NEXT:    subcc %g0, %i3, %l4
+; SPARC-NEXT:    subxcc %g0, %i2, %l3
+; SPARC-NEXT:    subxcc %g0, %i1, %l1
+; SPARC-NEXT:    subxcc %g0, %i0, %l2
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_26
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.8: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i3, %l4
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_27
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_9: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_28
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_10: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i0, %l2
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_29
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_11: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_30
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_12: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i1, %l1
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_31
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_13: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_32
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_14: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i2, %l3
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_33
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    ba .LBB0_34
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_15: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov 1, %g4
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_17
+; SPARC-NEXT:    mov %g4, %g2
+; SPARC-NEXT:  ! %bb.16: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %g2
+; SPARC-NEXT:  .LBB0_17: ! %overflow.no.rhs.only
+; SPARC-NEXT:    subcc %g0, %g3, %l4
+; SPARC-NEXT:    subxcc %g0, %l0, %l3
+; SPARC-NEXT:    subxcc %g0, %i5, %l1
+; SPARC-NEXT:    subxcc %g0, %i4, %l2
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_44
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.18: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g3, %l4
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_45
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_19: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_46
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_20: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i4, %l2
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_47
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_21: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_48
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_22: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i5, %l1
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_49
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_23: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_50
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_24: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %l0, %l3
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_51
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    ba .LBB0_52
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_25: ! %overflow.no
+; SPARC-NEXT:    smul %g3, %i0, %g2
+; SPARC-NEXT:    umul %g3, %i1, %i0
+; SPARC-NEXT:    rd %y, %l1
+; SPARC-NEXT:    mov %g0, %g4
+; SPARC-NEXT:    add %l1, %g2, %g2
+; SPARC-NEXT:    smul %l0, %i1, %i1
+; SPARC-NEXT:    smul %i5, %i2, %l1
+; SPARC-NEXT:    umul %i5, %i3, %i5
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    add %g2, %i1, %i1
+; SPARC-NEXT:    add %l2, %l1, %g2
+; SPARC-NEXT:    smul %i4, %i3, %i4
+; SPARC-NEXT:    add %g2, %i4, %i4
+; SPARC-NEXT:    addcc %i5, %i0, %i0
+; SPARC-NEXT:    umul %i3, %g3, %g2
+; SPARC-NEXT:    rd %y, %i5
+; SPARC-NEXT:    addxcc %i4, %i1, %i4
+; SPARC-NEXT:    umul %i2, %g3, %i1
+; SPARC-NEXT:    rd %y, %g3
+; SPARC-NEXT:    addcc %i1, %i5, %i1
+; SPARC-NEXT:    addxcc %g3, 0, %i5
+; SPARC-NEXT:    umul %i3, %l0, %i3
+; SPARC-NEXT:    rd %y, %l1
+; SPARC-NEXT:    addcc %i3, %i1, %g3
+; SPARC-NEXT:    addxcc %l1, 0, %i1
+; SPARC-NEXT:    addcc %i5, %i1, %i1
+; SPARC-NEXT:    addxcc %g0, 0, %i3
+; SPARC-NEXT:    umul %i2, %l0, %i2
+; SPARC-NEXT:    rd %y, %i5
+; SPARC-NEXT:    addcc %i2, %i1, %i1
+; SPARC-NEXT:    addxcc %i5, %i3, %i2
+; SPARC-NEXT:    addcc %i1, %i0, %i1
+; SPARC-NEXT:    ba .LBB0_112
+; SPARC-NEXT:    addxcc %i2, %i4, %i0
+; SPARC-NEXT:  .LBB0_26: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_9
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_27: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i2, %l3
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_10
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_28: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_11
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_29: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i1, %l1
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_12
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_30: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_13
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_31: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i0, %l2
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_14
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_32: ! %overflow.no.lhs.only
 ; SPARC-NEXT:    cmp %i0, 0
-; SPARC-NEXT:    bne .LBB0_2
+; SPARC-NEXT:    bl .LBB0_34
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_33: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i3, %l4
+; SPARC-NEXT:  .LBB0_34: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_36
+; SPARC-NEXT:    mov %g4, %i0
+; SPARC-NEXT:  ! %bb.35: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %i0
+; SPARC-NEXT:  .LBB0_36: ! %overflow.no.lhs.only
+; SPARC-NEXT:    subcc %g0, %g3, %l6
+; SPARC-NEXT:    subxcc %g0, %l0, %l5
+; SPARC-NEXT:    subxcc %g0, %i5, %i2
+; SPARC-NEXT:    subxcc %g0, %i4, %i1
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_62
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.37: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g3, %l6
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_63
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_38: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_64
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_39: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i5, %i2
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_65
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_40: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_66
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_41: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i4, %i1
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_67
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_42: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_68
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_43: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %l0, %l5
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_69
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    ba .LBB0_70
 ; SPARC-NEXT:    nop
-; SPARC-NEXT:  ! %bb.1: ! %start
-; SPARC-NEXT:    ba .LBB0_3
+; SPARC-NEXT:  .LBB0_44: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_19
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_45: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %l0, %l3
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_20
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_46: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_21
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_47: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i5, %l1
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_22
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_48: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_23
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_49: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i4, %l2
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_24
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_50: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_52
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_51: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g3, %l4
+; SPARC-NEXT:  .LBB0_52: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_54
+; SPARC-NEXT:    mov %g4, %i4
+; SPARC-NEXT:  ! %bb.53: ! %overflow.no.rhs.only
 ; SPARC-NEXT:    mov %g0, %i4
-; SPARC-NEXT:  .LBB0_2:
-; SPARC-NEXT:    mov 1, %i4
-; SPARC-NEXT:  .LBB0_3: ! %start
-; SPARC-NEXT:    mov %g3, %i0
+; SPARC-NEXT:  .LBB0_54: ! %overflow.no.rhs.only
+; SPARC-NEXT:    subcc %g0, %i3, %l5
+; SPARC-NEXT:    subxcc %g0, %i2, %l0
+; SPARC-NEXT:    subxcc %g0, %i1, %g3
+; SPARC-NEXT:    subxcc %g0, %i0, %i5
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_85
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.55: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i3, %l5
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_86
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_56: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_87
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_57: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i1, %g3
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_88
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_58: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_89
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_59: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i0, %i5
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_90
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_60: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_91
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_61: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i2, %l0
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_92
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    ba .LBB0_93
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_62: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_38
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_63: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %l0, %l5
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_39
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_64: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_40
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_65: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i4, %i1
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_41
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_66: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_42
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_67: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %i5, %i2
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bge .LBB0_43
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_68: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bl .LBB0_70
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_69: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g3, %l6
+; SPARC-NEXT:  .LBB0_70: ! %overflow.no.lhs.only
+; SPARC-NEXT:    umul %l4, %l6, %i3
+; SPARC-NEXT:    rd %y, %i4
+; SPARC-NEXT:    umul %l3, %l6, %i5
+; SPARC-NEXT:    rd %y, %g3
+; SPARC-NEXT:    addcc %i5, %i4, %i4
+; SPARC-NEXT:    addxcc %g3, 0, %i5
+; SPARC-NEXT:    umul %l4, %l5, %g3
+; SPARC-NEXT:    rd %y, %l0
+; SPARC-NEXT:    addcc %g3, %i4, %i4
+; SPARC-NEXT:    addxcc %l0, 0, %g3
+; SPARC-NEXT:    addcc %i5, %g3, %i5
+; SPARC-NEXT:    addxcc %g0, 0, %g3
+; SPARC-NEXT:    umul %l3, %l5, %l0
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l0, %i5, %i5
+; SPARC-NEXT:    smul %l6, %l2, %l0
+; SPARC-NEXT:    umul %l6, %l1, %l6
+; SPARC-NEXT:    rd %y, %o0
+; SPARC-NEXT:    addxcc %l7, %g3, %l7
+; SPARC-NEXT:    add %o0, %l0, %g3
+; SPARC-NEXT:    smul %l5, %l1, %l0
+; SPARC-NEXT:    add %g3, %l0, %l0
+; SPARC-NEXT:    addcc %i5, %l6, %g3
+; SPARC-NEXT:    umul %l4, %i2, %l5
+; SPARC-NEXT:    rd %y, %l6
+; SPARC-NEXT:    addxcc %l7, %l0, %i5
+; SPARC-NEXT:    umul %l3, %i2, %l0
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l0, %l6, %l0
+; SPARC-NEXT:    addxcc %l7, 0, %l6
+; SPARC-NEXT:    umul %l4, %i1, %l4
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l4, %l0, %l4
+; SPARC-NEXT:    addxcc %l7, 0, %l0
+; SPARC-NEXT:    addcc %l6, %l0, %l0
+; SPARC-NEXT:    addxcc %g0, 0, %l6
+; SPARC-NEXT:    umul %l3, %i1, %l3
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l3, %l0, %l0
+; SPARC-NEXT:    smul %i2, %l2, %l2
+; SPARC-NEXT:    umul %i2, %l1, %i2
+; SPARC-NEXT:    rd %y, %l3
+; SPARC-NEXT:    addxcc %l7, %l6, %l6
+; SPARC-NEXT:    add %l3, %l2, %l2
+; SPARC-NEXT:    smul %i1, %l1, %i1
+; SPARC-NEXT:    add %l2, %i1, %i1
+; SPARC-NEXT:    addcc %l0, %i2, %l0
+; SPARC-NEXT:    addxcc %l6, %i1, %l1
+; SPARC-NEXT:    addcc %g3, %l5, %i1
+; SPARC-NEXT:    addxcc %i5, %l4, %i2
+; SPARC-NEXT:    cmp %i2, %i5
+; SPARC-NEXT:    bcs .LBB0_72
+; SPARC-NEXT:    mov %g4, %l2
+; SPARC-NEXT:  ! %bb.71: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %l2
+; SPARC-NEXT:  .LBB0_72: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i1, %g3
+; SPARC-NEXT:    bcs .LBB0_74
+; SPARC-NEXT:    mov %g4, %g3
+; SPARC-NEXT:  ! %bb.73: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %g3
+; SPARC-NEXT:  .LBB0_74: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i2, %i5
+; SPARC-NEXT:    be .LBB0_76
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.75: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %l2, %g3
+; SPARC-NEXT:  .LBB0_76: ! %overflow.no.lhs.only
+; SPARC-NEXT:    addcc %l0, %g3, %i5
+; SPARC-NEXT:    addxcc %l1, 0, %l0
+; SPARC-NEXT:    xor %i0, %g2, %i0
+; SPARC-NEXT:    sub %g0, %i0, %l1
+; SPARC-NEXT:    xor %i4, %l1, %i4
+; SPARC-NEXT:    xor %i3, %l1, %i3
+; SPARC-NEXT:    addcc %i3, %i0, %g2
+; SPARC-NEXT:    addxcc %i4, 0, %g3
+; SPARC-NEXT:    cmp %g2, %i0
+; SPARC-NEXT:    bcs .LBB0_78
+; SPARC-NEXT:    mov %g4, %i3
+; SPARC-NEXT:  ! %bb.77: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %i3
+; SPARC-NEXT:  .LBB0_78: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %g3, 0
+; SPARC-NEXT:    be .LBB0_80
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.79: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %i3
+; SPARC-NEXT:  .LBB0_80: ! %overflow.no.lhs.only
+; SPARC-NEXT:    xor %i1, %l1, %i0
+; SPARC-NEXT:    xor %i2, %l1, %i2
+; SPARC-NEXT:    addcc %i0, %i3, %i1
+; SPARC-NEXT:    addxcc %i2, 0, %i0
+; SPARC-NEXT:    cmp %i1, %i3
+; SPARC-NEXT:    bcs .LBB0_82
+; SPARC-NEXT:    mov %g4, %i2
+; SPARC-NEXT:  ! %bb.81: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %i2
+; SPARC-NEXT:  .LBB0_82: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    be .LBB0_84
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.83: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %i2
+; SPARC-NEXT:  .LBB0_84: ! %overflow.no.lhs.only
+; SPARC-NEXT:    xor %i5, %l1, %i3
+; SPARC-NEXT:    xor %l0, %l1, %i4
+; SPARC-NEXT:    addcc %i3, %i2, %i2
+; SPARC-NEXT:    ba .LBB0_108
+; SPARC-NEXT:    addxcc %i4, 0, %i3
+; SPARC-NEXT:  .LBB0_85: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_56
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_86: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i2, %l0
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_57
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_87: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_58
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_88: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i0, %i5
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_59
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_89: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_60
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_90: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i1, %g3
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bge .LBB0_61
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_91: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    bl .LBB0_93
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  .LBB0_92: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %i3, %l5
+; SPARC-NEXT:  .LBB0_93: ! %overflow.no.rhs.only
+; SPARC-NEXT:    umul %l4, %l5, %i0
+; SPARC-NEXT:    rd %y, %i1
+; SPARC-NEXT:    umul %l3, %l5, %i2
+; SPARC-NEXT:    rd %y, %i3
+; SPARC-NEXT:    addcc %i2, %i1, %i1
+; SPARC-NEXT:    addxcc %i3, 0, %i2
+; SPARC-NEXT:    umul %l4, %l0, %i3
+; SPARC-NEXT:    rd %y, %l6
+; SPARC-NEXT:    addcc %i3, %i1, %i1
+; SPARC-NEXT:    addxcc %l6, 0, %i3
+; SPARC-NEXT:    addcc %i2, %i3, %i2
+; SPARC-NEXT:    addxcc %g0, 0, %i3
+; SPARC-NEXT:    umul %l3, %l0, %l6
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l6, %i2, %i2
+; SPARC-NEXT:    smul %l5, %l2, %l6
+; SPARC-NEXT:    umul %l5, %l1, %l5
+; SPARC-NEXT:    rd %y, %o0
+; SPARC-NEXT:    addxcc %l7, %i3, %l7
+; SPARC-NEXT:    add %o0, %l6, %i3
+; SPARC-NEXT:    smul %l0, %l1, %l0
+; SPARC-NEXT:    add %i3, %l0, %l0
+; SPARC-NEXT:    addcc %i2, %l5, %i3
+; SPARC-NEXT:    umul %l4, %g3, %l5
+; SPARC-NEXT:    rd %y, %l6
+; SPARC-NEXT:    addxcc %l7, %l0, %i2
+; SPARC-NEXT:    umul %l3, %g3, %l0
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l0, %l6, %l0
+; SPARC-NEXT:    addxcc %l7, 0, %l6
+; SPARC-NEXT:    umul %l4, %i5, %l4
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l4, %l0, %l0
+; SPARC-NEXT:    addxcc %l7, 0, %l4
+; SPARC-NEXT:    addcc %l6, %l4, %l4
+; SPARC-NEXT:    addxcc %g0, 0, %l6
+; SPARC-NEXT:    umul %l3, %i5, %l3
+; SPARC-NEXT:    rd %y, %l7
+; SPARC-NEXT:    addcc %l3, %l4, %l3
+; SPARC-NEXT:    smul %g3, %l2, %l2
+; SPARC-NEXT:    umul %g3, %l1, %g3
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addxcc %l7, %l6, %l6
+; SPARC-NEXT:    add %l4, %l2, %l2
+; SPARC-NEXT:    smul %i5, %l1, %i5
+; SPARC-NEXT:    add %l2, %i5, %i5
+; SPARC-NEXT:    addcc %l3, %g3, %g3
+; SPARC-NEXT:    addxcc %l6, %i5, %l1
+; SPARC-NEXT:    addcc %i3, %l5, %i5
+; SPARC-NEXT:    addxcc %i2, %l0, %l0
+; SPARC-NEXT:    cmp %l0, %i2
+; SPARC-NEXT:    bcs .LBB0_95
+; SPARC-NEXT:    mov %g4, %l2
+; SPARC-NEXT:  ! %bb.94: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %l2
+; SPARC-NEXT:  .LBB0_95: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i5, %i3
+; SPARC-NEXT:    bcs .LBB0_97
+; SPARC-NEXT:    mov %g4, %i3
+; SPARC-NEXT:  ! %bb.96: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %i3
+; SPARC-NEXT:  .LBB0_97: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %l0, %i2
+; SPARC-NEXT:    be .LBB0_99
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.98: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %l2, %i3
+; SPARC-NEXT:  .LBB0_99: ! %overflow.no.rhs.only
+; SPARC-NEXT:    addcc %g3, %i3, %i2
+; SPARC-NEXT:    addxcc %l1, 0, %i3
+; SPARC-NEXT:    xor %g2, %i4, %l1
+; SPARC-NEXT:    sub %g0, %l1, %i4
+; SPARC-NEXT:    xor %i1, %i4, %i1
+; SPARC-NEXT:    xor %i0, %i4, %i0
+; SPARC-NEXT:    addcc %i0, %l1, %g2
+; SPARC-NEXT:    addxcc %i1, 0, %g3
+; SPARC-NEXT:    cmp %g2, %l1
+; SPARC-NEXT:    bcs .LBB0_101
+; SPARC-NEXT:    mov %g4, %l1
+; SPARC-NEXT:  ! %bb.100: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %l1
+; SPARC-NEXT:  .LBB0_101: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %g3, 0
+; SPARC-NEXT:    be .LBB0_103
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.102: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %l1
+; SPARC-NEXT:  .LBB0_103: ! %overflow.no.rhs.only
+; SPARC-NEXT:    xor %i5, %i4, %i0
+; SPARC-NEXT:    xor %l0, %i4, %i5
+; SPARC-NEXT:    addcc %i0, %l1, %i1
+; SPARC-NEXT:    addxcc %i5, 0, %i0
+; SPARC-NEXT:    cmp %i1, %l1
+; SPARC-NEXT:    bcs .LBB0_105
+; SPARC-NEXT:    mov %g4, %i5
+; SPARC-NEXT:  ! %bb.104: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %i5
+; SPARC-NEXT:  .LBB0_105: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    be .LBB0_107
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.106: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %i5
+; SPARC-NEXT:  .LBB0_107: ! %overflow.no.rhs.only
+; SPARC-NEXT:    xor %i2, %i4, %i2
+; SPARC-NEXT:    xor %i3, %i4, %i3
+; SPARC-NEXT:    addcc %i2, %i5, %i2
+; SPARC-NEXT:    addxcc %i3, 0, %i3
+; SPARC-NEXT:  .LBB0_108: ! %overflow.no.rhs.only
+; SPARC-NEXT:    or %i2, %i3, %i2
+; SPARC-NEXT:    cmp %i2, 0
+; SPARC-NEXT:    bne .LBB0_112
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.109: ! %overflow.no.rhs.only
+; SPARC-NEXT:    ba .LBB0_112
+; SPARC-NEXT:    mov %g0, %g4
+; SPARC-NEXT:  .LBB0_110:
+; SPARC-NEXT:    mov 1, %g4
+; SPARC-NEXT:  .LBB0_111: ! %overflow
+; SPARC-NEXT:    mov %i2, %i1
+; SPARC-NEXT:  .LBB0_112: ! %overflow.res
+; SPARC-NEXT:    and %g4, 1, %i4
+; SPARC-NEXT:    mov %g3, %i2
 ; SPARC-NEXT:    ret
-; SPARC-NEXT:    restore %g0, %g2, %o1
+; SPARC-NEXT:    restore %g0, %g2, %o3
 ;
 ; SPARC64-LABEL: muloti_test:
 ; SPARC64:         .register %g2, #scratch
 ; SPARC64-NEXT:    .register %g3, #scratch
-; SPARC64-NEXT:  ! %bb.0: ! %start
+; SPARC64-NEXT:  ! %bb.0: ! %overflow.entry
 ; SPARC64-NEXT:    save %sp, -176, %sp
-; SPARC64-NEXT:    mov %i3, %i4
-; SPARC64-NEXT:    mov %i1, %i5
-; SPARC64-NEXT:    mov %i0, %l2
-; SPARC64-NEXT:    srax %i0, 63, %i3
-; SPARC64-NEXT:    mov %i3, %o0
+; SPARC64-NEXT:    mov %i1, %i4
+; SPARC64-NEXT:    srax %i1, 63, %i1
+; SPARC64-NEXT:    cmp %i0, %i1
+; SPARC64-NEXT:    be %xcc, .LBB0_3
+; SPARC64-NEXT:    srax %i3, 63, %i1
+; SPARC64-NEXT:  ! %bb.1: ! %overflow.lhs
+; SPARC64-NEXT:    cmp %i2, %i1
+; SPARC64-NEXT:    be %xcc, .LBB0_5
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:  ! %bb.2: ! %overflow
+; SPARC64-NEXT:    srax %i0, 63, %i5
+; SPARC64-NEXT:    mov %i5, %o0
 ; SPARC64-NEXT:    mov %i0, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %i4, %o3
+; SPARC64-NEXT:    mov %i3, %o3
 ; SPARC64-NEXT:    mov %o0, %l0
 ; SPARC64-NEXT:    mov %o1, %l1
 ; SPARC64-NEXT:    mov %g0, %o0
-; SPARC64-NEXT:    mov %i1, %o1
+; SPARC64-NEXT:    mov %i4, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %i4, %o3
+; SPARC64-NEXT:    mov %i3, %o3
 ; SPARC64-NEXT:    mov %o1, %i1
-; SPARC64-NEXT:    mov %g0, %i0
-; SPARC64-NEXT:    add %l1, %o0, %l3
-; SPARC64-NEXT:    cmp %l3, %l1
-; SPARC64-NEXT:    movcs %xcc, 1, %i0
-; SPARC64-NEXT:    srl %i0, 0, %i0
-; SPARC64-NEXT:    add %l0, %i0, %l0
+; SPARC64-NEXT:    mov %g0, %i3
+; SPARC64-NEXT:    add %l1, %o0, %l2
+; SPARC64-NEXT:    cmp %l2, %l1
+; SPARC64-NEXT:    movcs %xcc, 1, %i3
+; SPARC64-NEXT:    srl %i3, 0, %i3
+; SPARC64-NEXT:    add %l0, %i3, %l0
 ; SPARC64-NEXT:    srax %l0, 63, %l1
-; SPARC64-NEXT:    srax %i2, 63, %i4
+; SPARC64-NEXT:    srax %i2, 63, %i3
 ; SPARC64-NEXT:    mov %g0, %o0
-; SPARC64-NEXT:    mov %i5, %o1
-; SPARC64-NEXT:    mov %i4, %o2
+; SPARC64-NEXT:    mov %i4, %o1
+; SPARC64-NEXT:    mov %i3, %o2
 ; SPARC64-NEXT:    call __multi3
 ; SPARC64-NEXT:    mov %i2, %o3
-; SPARC64-NEXT:    mov %g0, %i5
+; SPARC64-NEXT:    mov %g0, %i4
 ; SPARC64-NEXT:    mov %g0, %g2
-; SPARC64-NEXT:    add %o1, %l3, %i0
-; SPARC64-NEXT:    cmp %i0, %o1
-; SPARC64-NEXT:    movcs %xcc, 1, %i5
-; SPARC64-NEXT:    srl %i5, 0, %i5
-; SPARC64-NEXT:    add %o0, %i5, %i5
-; SPARC64-NEXT:    srax %i5, 63, %g3
-; SPARC64-NEXT:    add %l1, %g3, %g3
-; SPARC64-NEXT:    add %l0, %i5, %i5
-; SPARC64-NEXT:    cmp %i5, %l0
+; SPARC64-NEXT:    add %o1, %l2, %g3
+; SPARC64-NEXT:    cmp %g3, %o1
+; SPARC64-NEXT:    movcs %xcc, 1, %i4
+; SPARC64-NEXT:    srl %i4, 0, %i4
+; SPARC64-NEXT:    add %o0, %i4, %i4
+; SPARC64-NEXT:    srax %i4, 63, %g4
+; SPARC64-NEXT:    add %l1, %g4, %g4
+; SPARC64-NEXT:    add %l0, %i4, %i4
+; SPARC64-NEXT:    cmp %i4, %l0
 ; SPARC64-NEXT:    movcs %xcc, 1, %g2
 ; SPARC64-NEXT:    srl %g2, 0, %g2
-; SPARC64-NEXT:    add %g3, %g2, %l0
-; SPARC64-NEXT:    mov %i3, %o0
-; SPARC64-NEXT:    mov %l2, %o1
-; SPARC64-NEXT:    mov %i4, %o2
+; SPARC64-NEXT:    add %g4, %g2, %l0
+; SPARC64-NEXT:    mov %i5, %o0
+; SPARC64-NEXT:    mov %i0, %o1
+; SPARC64-NEXT:    mov %g3, %i0
+; SPARC64-NEXT:    mov %i3, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i2, %o3
+; SPARC64-NEXT:    mov %g0, %i3
+; SPARC64-NEXT:    mov %g0, %i2
+; SPARC64-NEXT:    add %o0, %l0, %i5
+; SPARC64-NEXT:    add %o1, %i4, %i4
+; SPARC64-NEXT:    cmp %i4, %o1
+; SPARC64-NEXT:    movcs %xcc, 1, %i3
+; SPARC64-NEXT:    srl %i3, 0, %i3
+; SPARC64-NEXT:    add %i5, %i3, %i3
+; SPARC64-NEXT:    srax %i0, 63, %i5
+; SPARC64-NEXT:    xor %i3, %i5, %i3
+; SPARC64-NEXT:    xor %i4, %i5, %i4
+; SPARC64-NEXT:    ba .LBB0_7
+; SPARC64-NEXT:    or %i4, %i3, %i3
+; SPARC64-NEXT:  .LBB0_3: ! %overflow.no.lhs
+; SPARC64-NEXT:    cmp %i2, %i1
+; SPARC64-NEXT:    be %xcc, .LBB0_8
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:  ! %bb.4: ! %overflow.no.lhs.only
+; SPARC64-NEXT:    mov %g0, %i5
+; SPARC64-NEXT:    mov %g0, %i1
+; SPARC64-NEXT:    mov %g0, %l0
+; SPARC64-NEXT:    mov %g0, %g2
+; SPARC64-NEXT:    movrnz %i4, 1, %i1
+; SPARC64-NEXT:    srl %i1, 0, %i1
+; SPARC64-NEXT:    add %i0, %i1, %i1
+; SPARC64-NEXT:    sub %g0, %i1, %i1
+; SPARC64-NEXT:    mov %i0, %g3
+; SPARC64-NEXT:    movrlz %i0, %i1, %g3
+; SPARC64-NEXT:    sub %g0, %i4, %i1
+; SPARC64-NEXT:    mov %i4, %g4
+; SPARC64-NEXT:    movrlz %i0, %i1, %g4
+; SPARC64-NEXT:    movrlz %i0, 1, %i5
+; SPARC64-NEXT:    movrlz %i0, %g4, %i4
+; SPARC64-NEXT:    movrlz %i0, %g3, %i0
+; SPARC64-NEXT:    movrlz %i2, 1, %l0
+; SPARC64-NEXT:    sub %g0, %i3, %i1
+; SPARC64-NEXT:    mov %i3, %g3
+; SPARC64-NEXT:    movrlz %i2, %i1, %g3
+; SPARC64-NEXT:    movrnz %i3, 1, %g2
+; SPARC64-NEXT:    srl %g2, 0, %i1
+; SPARC64-NEXT:    add %i2, %i1, %i1
+; SPARC64-NEXT:    sub %g0, %i1, %i1
+; SPARC64-NEXT:    mov %i2, %g2
+; SPARC64-NEXT:    movrlz %i2, %i1, %g2
+; SPARC64-NEXT:    movrlz %i2, %g3, %i3
+; SPARC64-NEXT:    movrlz %i2, %g2, %i2
+; SPARC64-NEXT:    mov %i0, %o0
+; SPARC64-NEXT:    mov %i4, %o1
+; SPARC64-NEXT:    mov %g0, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i3, %o3
+; SPARC64-NEXT:    mov %o0, %i1
+; SPARC64-NEXT:    mov %o1, %i3
+; SPARC64-NEXT:    mov %i0, %o0
+; SPARC64-NEXT:    mov %i4, %o1
+; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
 ; SPARC64-NEXT:    mov %i2, %o3
+; SPARC64-NEXT:    mov %g0, %i0
+; SPARC64-NEXT:    mov %g0, %i4
+; SPARC64-NEXT:    mov %g0, %g2
 ; SPARC64-NEXT:    mov %g0, %i2
+; SPARC64-NEXT:    add %i1, %o1, %g3
+; SPARC64-NEXT:    cmp %g3, %i1
+; SPARC64-NEXT:    movcs %xcc, 1, %i0
+; SPARC64-NEXT:    srl %i0, 0, %i0
+; SPARC64-NEXT:    add %o0, %i0, %g4
+; SPARC64-NEXT:    xor %l0, %i5, %i0
+; SPARC64-NEXT:    and %i0, 1, %i1
+; SPARC64-NEXT:    sub %g0, %i1, %i5
+; SPARC64-NEXT:    srl %i0, 0, %i0
+; SPARC64-NEXT:    xor %i3, %i5, %i1
+; SPARC64-NEXT:    add %i1, %i0, %i1
+; SPARC64-NEXT:    cmp %i1, %i0
+; SPARC64-NEXT:    movcs %xcc, 1, %i4
+; SPARC64-NEXT:    ba .LBB0_6
+; SPARC64-NEXT:    srl %i4, 0, %i3
+; SPARC64-NEXT:  .LBB0_5: ! %overflow.no.rhs.only
+; SPARC64-NEXT:    mov %g0, %i5
+; SPARC64-NEXT:    mov %g0, %i1
+; SPARC64-NEXT:    mov %g0, %l0
+; SPARC64-NEXT:    mov %g0, %g2
+; SPARC64-NEXT:    movrnz %i3, 1, %i1
+; SPARC64-NEXT:    srl %i1, 0, %i1
+; SPARC64-NEXT:    add %i2, %i1, %i1
+; SPARC64-NEXT:    sub %g0, %i1, %i1
+; SPARC64-NEXT:    mov %i2, %g3
+; SPARC64-NEXT:    movrlz %i2, %i1, %g3
+; SPARC64-NEXT:    sub %g0, %i3, %i1
+; SPARC64-NEXT:    mov %i3, %g4
+; SPARC64-NEXT:    movrlz %i2, %i1, %g4
+; SPARC64-NEXT:    movrlz %i2, 1, %i5
+; SPARC64-NEXT:    movrlz %i2, %g4, %i3
+; SPARC64-NEXT:    movrlz %i2, %g3, %i2
+; SPARC64-NEXT:    movrlz %i0, 1, %l0
+; SPARC64-NEXT:    sub %g0, %i4, %i1
+; SPARC64-NEXT:    mov %i4, %g3
+; SPARC64-NEXT:    movrlz %i0, %i1, %g3
+; SPARC64-NEXT:    movrnz %i4, 1, %g2
+; SPARC64-NEXT:    srl %g2, 0, %i1
+; SPARC64-NEXT:    add %i0, %i1, %i1
+; SPARC64-NEXT:    sub %g0, %i1, %i1
+; SPARC64-NEXT:    mov %i0, %g2
+; SPARC64-NEXT:    movrlz %i0, %i1, %g2
+; SPARC64-NEXT:    movrlz %i0, %g3, %i4
+; SPARC64-NEXT:    movrlz %i0, %g2, %i0
+; SPARC64-NEXT:    mov %i2, %o0
+; SPARC64-NEXT:    mov %i3, %o1
+; SPARC64-NEXT:    mov %g0, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i4, %o3
+; SPARC64-NEXT:    mov %o0, %i1
+; SPARC64-NEXT:    mov %o1, %i4
+; SPARC64-NEXT:    mov %i2, %o0
+; SPARC64-NEXT:    mov %i3, %o1
+; SPARC64-NEXT:    mov %g0, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i0, %o3
+; SPARC64-NEXT:    mov %g0, %i0
 ; SPARC64-NEXT:    mov %g0, %i3
-; SPARC64-NEXT:    add %o0, %l0, %i4
-; SPARC64-NEXT:    add %o1, %i5, %i5
-; SPARC64-NEXT:    cmp %i5, %o1
-; SPARC64-NEXT:    movcs %xcc, 1, %i2
-; SPARC64-NEXT:    srl %i2, 0, %i2
-; SPARC64-NEXT:    add %i4, %i2, %i2
-; SPARC64-NEXT:    srax %i0, 63, %i4
-; SPARC64-NEXT:    xor %i2, %i4, %i2
-; SPARC64-NEXT:    xor %i5, %i4, %i4
-; SPARC64-NEXT:    or %i4, %i2, %i2
-; SPARC64-NEXT:    movrnz %i2, 1, %i3
-; SPARC64-NEXT:    srl %i3, 0, %i2
+; SPARC64-NEXT:    mov %g0, %g2
+; SPARC64-NEXT:    mov %g0, %i2
+; SPARC64-NEXT:    add %i1, %o1, %g3
+; SPARC64-NEXT:    cmp %g3, %i1
+; SPARC64-NEXT:    movcs %xcc, 1, %i0
+; SPARC64-NEXT:    srl %i0, 0, %i0
+; SPARC64-NEXT:    add %o0, %i0, %g4
+; SPARC64-NEXT:    xor %i5, %l0, %i0
+; SPARC64-NEXT:    and %i0, 1, %i1
+; SPARC64-NEXT:    sub %g0, %i1, %i5
+; SPARC64-NEXT:    srl %i0, 0, %i0
+; SPARC64-NEXT:    xor %i4, %i5, %i1
+; SPARC64-NEXT:    add %i1, %i0, %i1
+; SPARC64-NEXT:    cmp %i1, %i0
+; SPARC64-NEXT:    movcs %xcc, 1, %i3
+; SPARC64-NEXT:    srl %i3, 0, %i3
+; SPARC64-NEXT:  .LBB0_6: ! %overflow.res
+; SPARC64-NEXT:    xor %g3, %i5, %i0
+; SPARC64-NEXT:    add %i0, %i3, %i0
+; SPARC64-NEXT:    cmp %i0, %i3
+; SPARC64-NEXT:    movcs %xcc, 1, %g2
+; SPARC64-NEXT:    srl %g2, 0, %i3
+; SPARC64-NEXT:    xor %g4, %i5, %i4
+; SPARC64-NEXT:    add %i4, %i3, %i3
+; SPARC64-NEXT:  .LBB0_7: ! %overflow.res
+; SPARC64-NEXT:    ba .LBB0_9
+; SPARC64-NEXT:    movrnz %i3, 1, %i2
+; SPARC64-NEXT:  .LBB0_8: ! %overflow.no
+; SPARC64-NEXT:    mov %i0, %o0
+; SPARC64-NEXT:    mov %i4, %o1
+; SPARC64-NEXT:    mov %i2, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i3, %o3
+; SPARC64-NEXT:    mov %o0, %i0
+; SPARC64-NEXT:    mov %o1, %i1
+; SPARC64-NEXT:    mov %g0, %i2
+; SPARC64-NEXT:  .LBB0_9: ! %overflow.res
+; SPARC64-NEXT:    and %i2, 1, %i2
 ; SPARC64-NEXT:    ret
 ; SPARC64-NEXT:    restore
 ;
 ; SPARC64-VIS3-LABEL: muloti_test:
 ; SPARC64-VIS3:         .register %g2, #scratch
 ; SPARC64-VIS3-NEXT:    .register %g3, #scratch
-; SPARC64-VIS3-NEXT:  ! %bb.0: ! %start
+; SPARC64-VIS3-NEXT:  ! %bb.0: ! %overflow.entry
 ; SPARC64-VIS3-NEXT:    save %sp, -128, %sp
-; SPARC64-VIS3-NEXT:    mov %g0, %i5
-; SPARC64-VIS3-NEXT:    umulxhi %i0, %i3, %i4
-; SPARC64-VIS3-NEXT:    srax %i0, 63, %g2
-; SPARC64-VIS3-NEXT:    mulx %g2, %i3, %g3
-; SPARC64-VIS3-NEXT:    add %i4, %g3, %i4
+; SPARC64-VIS3-NEXT:    srax %i1, 63, %i4
+; SPARC64-VIS3-NEXT:    cmp %i0, %i4
+; SPARC64-VIS3-NEXT:    be %xcc, .LBB0_3
+; SPARC64-VIS3-NEXT:    srax %i3, 63, %i4
+; SPARC64-VIS3-NEXT:  ! %bb.1: ! %overflow.lhs
+; SPARC64-VIS3-NEXT:    cmp %i2, %i4
+; SPARC64-VIS3-NEXT:    be %xcc, .LBB0_5
+; SPARC64-VIS3-NEXT:    nop
+; SPARC64-VIS3-NEXT:  ! %bb.2: ! %overflow
+; SPARC64-VIS3-NEXT:    mov %g0, %i4
+; SPARC64-VIS3-NEXT:    srax %i0, 63, %i5
+; SPARC64-VIS3-NEXT:    mulx %i5, %i3, %g2
+; SPARC64-VIS3-NEXT:    umulxhi %i0, %i3, %g3
+; SPARC64-VIS3-NEXT:    add %g3, %g2, %g2
 ; SPARC64-VIS3-NEXT:    umulxhi %i1, %i3, %g3
 ; SPARC64-VIS3-NEXT:    mulx %i0, %i3, %g4
 ; SPARC64-VIS3-NEXT:    addcc %g4, %g3, %g3
-; SPARC64-VIS3-NEXT:    addxccc %i4, %g0, %g4
-; SPARC64-VIS3-NEXT:    umulxhi %i1, %i2, %i4
-; SPARC64-VIS3-NEXT:    srax %i2, 63, %g5
-; SPARC64-VIS3-NEXT:    mulx %i1, %g5, %l0
-; SPARC64-VIS3-NEXT:    add %i4, %l0, %l0
-; SPARC64-VIS3-NEXT:    mulx %i1, %i2, %i4
-; SPARC64-VIS3-NEXT:    addcc %i4, %g3, %i4
-; SPARC64-VIS3-NEXT:    addxccc %l0, %g0, %g3
-; SPARC64-VIS3-NEXT:    srax %g3, 63, %l0
-; SPARC64-VIS3-NEXT:    addcc %g4, %g3, %g3
-; SPARC64-VIS3-NEXT:    srax %g4, 63, %g4
-; SPARC64-VIS3-NEXT:    addxccc %g4, %l0, %g4
-; SPARC64-VIS3-NEXT:    and %g5, %i0, %g5
-; SPARC64-VIS3-NEXT:    and %g2, %i2, %g2
-; SPARC64-VIS3-NEXT:    add %g2, %g5, %g2
-; SPARC64-VIS3-NEXT:    umulxhi %i0, %i2, %g5
-; SPARC64-VIS3-NEXT:    sub %g5, %g2, %g2
-; SPARC64-VIS3-NEXT:    mulx %i0, %i2, %i0
-; SPARC64-VIS3-NEXT:    addcc %i0, %g3, %i0
-; SPARC64-VIS3-NEXT:    addxccc %g2, %g4, %i2
-; SPARC64-VIS3-NEXT:    srax %i4, 63, %g2
+; SPARC64-VIS3-NEXT:    addxccc %g2, %g0, %g2
+; SPARC64-VIS3-NEXT:    srax %i2, 63, %g4
+; SPARC64-VIS3-NEXT:    mulx %i1, %g4, %g5
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i2, %l0
+; SPARC64-VIS3-NEXT:    add %l0, %g5, %g5
+; SPARC64-VIS3-NEXT:    mulx %i1, %i2, %l0
+; SPARC64-VIS3-NEXT:    addcc %l0, %g3, %g3
+; SPARC64-VIS3-NEXT:    addxccc %g5, %g0, %g5
+; SPARC64-VIS3-NEXT:    srax %g5, 63, %l0
+; SPARC64-VIS3-NEXT:    addcc %g2, %g5, %g5
+; SPARC64-VIS3-NEXT:    srax %g2, 63, %g2
+; SPARC64-VIS3-NEXT:    addxccc %g2, %l0, %g2
+; SPARC64-VIS3-NEXT:    and %g4, %i0, %g4
+; SPARC64-VIS3-NEXT:    and %i5, %i2, %i5
+; SPARC64-VIS3-NEXT:    add %i5, %g4, %i5
+; SPARC64-VIS3-NEXT:    umulxhi %i0, %i2, %g4
+; SPARC64-VIS3-NEXT:    sub %g4, %i5, %i5
+; SPARC64-VIS3-NEXT:    mulx %i0, %i2, %i2
+; SPARC64-VIS3-NEXT:    mov %g3, %i0
+; SPARC64-VIS3-NEXT:    addcc %i2, %g5, %i2
+; SPARC64-VIS3-NEXT:    addxccc %i5, %g2, %i5
+; SPARC64-VIS3-NEXT:    srax %g3, 63, %g2
+; SPARC64-VIS3-NEXT:    xor %i5, %g2, %i5
 ; SPARC64-VIS3-NEXT:    xor %i2, %g2, %i2
-; SPARC64-VIS3-NEXT:    xor %i0, %g2, %i0
-; SPARC64-VIS3-NEXT:    or %i0, %i2, %i0
-; SPARC64-VIS3-NEXT:    movrnz %i0, 1, %i5
+; SPARC64-VIS3-NEXT:    or %i2, %i5, %i2
+; SPARC64-VIS3-NEXT:    ba .LBB0_7
+; SPARC64-VIS3-NEXT:    movrnz %i2, 1, %i4
+; SPARC64-VIS3-NEXT:  .LBB0_3: ! %overflow.no.lhs
+; SPARC64-VIS3-NEXT:    cmp %i2, %i4
+; SPARC64-VIS3-NEXT:    be %xcc, .LBB0_6
+; SPARC64-VIS3-NEXT:    nop
+; SPARC64-VIS3-NEXT:  ! %bb.4: ! %overflow.no.lhs.only
+; SPARC64-VIS3-NEXT:    mov %g0, %i5
+; SPARC64-VIS3-NEXT:    mov %g0, %g3
+; SPARC64-VIS3-NEXT:    mov %g0, %g2
+; SPARC64-VIS3-NEXT:    mov %g0, %g4
+; SPARC64-VIS3-NEXT:    mov %g0, %g5
+; SPARC64-VIS3-NEXT:    mov %g0, %l0
+; SPARC64-VIS3-NEXT:    mov %g0, %l1
+; SPARC64-VIS3-NEXT:    mov %g0, %i4
+; SPARC64-VIS3-NEXT:    sub %g0, %i1, %l2
+; SPARC64-VIS3-NEXT:    mov %i1, %l3
+; SPARC64-VIS3-NEXT:    movrlz %i0, %l2, %l3
+; SPARC64-VIS3-NEXT:    movrnz %i1, 1, %g3
+; SPARC64-VIS3-NEXT:    srl %g3, 0, %g3
+; SPARC64-VIS3-NEXT:    add %i0, %g3, %g3
+; SPARC64-VIS3-NEXT:    sub %g0, %g3, %g3
+; SPARC64-VIS3-NEXT:    mov %i0, %l2
+; SPARC64-VIS3-NEXT:    movrlz %i0, %g3, %l2
+; SPARC64-VIS3-NEXT:    movrlz %i0, 1, %i5
+; SPARC64-VIS3-NEXT:    movrlz %i0, %l3, %i1
+; SPARC64-VIS3-NEXT:    movrlz %i0, %l2, %i0
+; SPARC64-VIS3-NEXT:    sub %g0, %i3, %g3
+; SPARC64-VIS3-NEXT:    mov %i3, %l2
+; SPARC64-VIS3-NEXT:    movrlz %i2, %g3, %l2
+; SPARC64-VIS3-NEXT:    movrnz %i3, 1, %g4
+; SPARC64-VIS3-NEXT:    srl %g4, 0, %g3
+; SPARC64-VIS3-NEXT:    add %i2, %g3, %g3
+; SPARC64-VIS3-NEXT:    sub %g0, %g3, %g3
+; SPARC64-VIS3-NEXT:    mov %i2, %g4
+; SPARC64-VIS3-NEXT:    movrlz %i2, %g3, %g4
+; SPARC64-VIS3-NEXT:    movrlz %i2, 1, %g2
+; SPARC64-VIS3-NEXT:    movrlz %i2, %l2, %i3
+; SPARC64-VIS3-NEXT:    movrlz %i2, %g4, %i2
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i3, %g3
+; SPARC64-VIS3-NEXT:    mulx %i0, %i3, %g4
+; SPARC64-VIS3-NEXT:    add %g3, %g4, %g3
+; SPARC64-VIS3-NEXT:    mulx %i0, %i2, %i0
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i2, %g4
+; SPARC64-VIS3-NEXT:    add %g4, %i0, %i0
+; SPARC64-VIS3-NEXT:    mulx %i1, %i3, %i3
+; SPARC64-VIS3-NEXT:    mulx %i1, %i2, %i1
+; SPARC64-VIS3-NEXT:    add %g3, %i1, %i2
+; SPARC64-VIS3-NEXT:    cmp %i2, %g3
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %g5
+; SPARC64-VIS3-NEXT:    srl %g5, 0, %i1
+; SPARC64-VIS3-NEXT:    add %i0, %i1, %g3
+; SPARC64-VIS3-NEXT:    xor %g2, %i5, %i0
+; SPARC64-VIS3-NEXT:    and %i0, 1, %i1
+; SPARC64-VIS3-NEXT:    sub %g0, %i1, %i5
+; SPARC64-VIS3-NEXT:    srl %i0, 0, %i0
+; SPARC64-VIS3-NEXT:    xor %i3, %i5, %i1
+; SPARC64-VIS3-NEXT:    add %i1, %i0, %i1
+; SPARC64-VIS3-NEXT:    cmp %i1, %i0
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %l0
+; SPARC64-VIS3-NEXT:    srl %l0, 0, %i3
+; SPARC64-VIS3-NEXT:    xor %i2, %i5, %i0
+; SPARC64-VIS3-NEXT:    add %i0, %i3, %i0
+; SPARC64-VIS3-NEXT:    cmp %i0, %i3
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %l1
+; SPARC64-VIS3-NEXT:    srl %l1, 0, %i2
+; SPARC64-VIS3-NEXT:    xor %g3, %i5, %i3
+; SPARC64-VIS3-NEXT:    add %i3, %i2, %i2
+; SPARC64-VIS3-NEXT:    ba .LBB0_8
+; SPARC64-VIS3-NEXT:    movrnz %i2, 1, %i4
+; SPARC64-VIS3-NEXT:  .LBB0_5: ! %overflow.no.rhs.only
+; SPARC64-VIS3-NEXT:    mov %g0, %i5
+; SPARC64-VIS3-NEXT:    mov %g0, %g3
+; SPARC64-VIS3-NEXT:    mov %g0, %g2
+; SPARC64-VIS3-NEXT:    mov %g0, %g4
+; SPARC64-VIS3-NEXT:    mov %g0, %g5
+; SPARC64-VIS3-NEXT:    mov %g0, %l0
+; SPARC64-VIS3-NEXT:    mov %g0, %l1
+; SPARC64-VIS3-NEXT:    mov %g0, %i4
+; SPARC64-VIS3-NEXT:    sub %g0, %i3, %l2
+; SPARC64-VIS3-NEXT:    mov %i3, %l3
+; SPARC64-VIS3-NEXT:    movrlz %i2, %l2, %l3
+; SPARC64-VIS3-NEXT:    movrnz %i3, 1, %g3
+; SPARC64-VIS3-NEXT:    srl %g3, 0, %g3
+; SPARC64-VIS3-NEXT:    add %i2, %g3, %g3
+; SPARC64-VIS3-NEXT:    sub %g0, %g3, %g3
+; SPARC64-VIS3-NEXT:    mov %i2, %l2
+; SPARC64-VIS3-NEXT:    movrlz %i2, %g3, %l2
+; SPARC64-VIS3-NEXT:    movrlz %i2, 1, %i5
+; SPARC64-VIS3-NEXT:    movrlz %i2, %l3, %i3
+; SPARC64-VIS3-NEXT:    movrlz %i2, %l2, %i2
+; SPARC64-VIS3-NEXT:    sub %g0, %i1, %g3
+; SPARC64-VIS3-NEXT:    mov %i1, %l2
+; SPARC64-VIS3-NEXT:    movrlz %i0, %g3, %l2
+; SPARC64-VIS3-NEXT:    movrnz %i1, 1, %g4
+; SPARC64-VIS3-NEXT:    srl %g4, 0, %g3
+; SPARC64-VIS3-NEXT:    add %i0, %g3, %g3
+; SPARC64-VIS3-NEXT:    sub %g0, %g3, %g3
+; SPARC64-VIS3-NEXT:    mov %i0, %g4
+; SPARC64-VIS3-NEXT:    movrlz %i0, %g3, %g4
+; SPARC64-VIS3-NEXT:    movrlz %i0, 1, %g2
+; SPARC64-VIS3-NEXT:    movrlz %i0, %l2, %i1
+; SPARC64-VIS3-NEXT:    movrlz %i0, %g4, %i0
+; SPARC64-VIS3-NEXT:    umulxhi %i3, %i1, %g3
+; SPARC64-VIS3-NEXT:    mulx %i2, %i1, %g4
+; SPARC64-VIS3-NEXT:    add %g3, %g4, %g3
+; SPARC64-VIS3-NEXT:    mulx %i2, %i0, %i2
+; SPARC64-VIS3-NEXT:    umulxhi %i3, %i0, %g4
+; SPARC64-VIS3-NEXT:    add %g4, %i2, %i2
+; SPARC64-VIS3-NEXT:    mulx %i3, %i1, %i1
+; SPARC64-VIS3-NEXT:    mulx %i3, %i0, %i0
+; SPARC64-VIS3-NEXT:    add %g3, %i0, %i0
+; SPARC64-VIS3-NEXT:    cmp %i0, %g3
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %g5
+; SPARC64-VIS3-NEXT:    srl %g5, 0, %i3
+; SPARC64-VIS3-NEXT:    add %i2, %i3, %i2
+; SPARC64-VIS3-NEXT:    xor %i5, %g2, %i3
+; SPARC64-VIS3-NEXT:    and %i3, 1, %i5
+; SPARC64-VIS3-NEXT:    sub %g0, %i5, %i5
+; SPARC64-VIS3-NEXT:    srl %i3, 0, %i3
+; SPARC64-VIS3-NEXT:    xor %i1, %i5, %i1
+; SPARC64-VIS3-NEXT:    add %i1, %i3, %i1
+; SPARC64-VIS3-NEXT:    cmp %i1, %i3
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %l0
+; SPARC64-VIS3-NEXT:    srl %l0, 0, %i3
+; SPARC64-VIS3-NEXT:    xor %i0, %i5, %i0
+; SPARC64-VIS3-NEXT:    add %i0, %i3, %i0
+; SPARC64-VIS3-NEXT:    cmp %i0, %i3
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %l1
+; SPARC64-VIS3-NEXT:    srl %l1, 0, %i3
+; SPARC64-VIS3-NEXT:    xor %i2, %i5, %i2
+; SPARC64-VIS3-NEXT:    add %i2, %i3, %i2
+; SPARC64-VIS3-NEXT:    ba .LBB0_8
+; SPARC64-VIS3-NEXT:    movrnz %i2, 1, %i4
+; SPARC64-VIS3-NEXT:  .LBB0_6: ! %overflow.no
+; SPARC64-VIS3-NEXT:    mov %g0, %i4
+; SPARC64-VIS3-NEXT:    mulx %i1, %i2, %i2
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i3, %i5
+; SPARC64-VIS3-NEXT:    add %i5, %i2, %i2
+; SPARC64-VIS3-NEXT:    mulx %i0, %i3, %i0
+; SPARC64-VIS3-NEXT:    add %i2, %i0, %i0
+; SPARC64-VIS3-NEXT:  .LBB0_7: ! %overflow.res
 ; SPARC64-VIS3-NEXT:    mulx %i1, %i3, %i1
-; SPARC64-VIS3-NEXT:    srl %i5, 0, %i2
+; SPARC64-VIS3-NEXT:  .LBB0_8: ! %overflow.res
+; SPARC64-VIS3-NEXT:    and %i4, 1, %i2
 ; SPARC64-VIS3-NEXT:    ret
-; SPARC64-VIS3-NEXT:    restore %g0, %i4, %o0
+; SPARC64-VIS3-NEXT:    restore
 start:
   %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r)
   %1 = extractvalue { i128, i1 } %0, 0
diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
index 6d197c88bfecd..4533523f97d74 100644
--- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll
@@ -5,207 +5,470 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC-LABEL: muloti_test:
-; SPARC:       ! %bb.0: ! %start
+; SPARC:       ! %bb.0: ! %overflow.entry
 ; SPARC-NEXT:    save %sp, -96, %sp
+; SPARC-NEXT:    ld [%fp+96], %l1
+; SPARC-NEXT:    ld [%fp+92], %g4
+; SPARC-NEXT:    or %i1, %i0, %l0
+; SPARC-NEXT:    cmp %l0, 0
 ; SPARC-NEXT:    mov %i3, %g2
-; SPARC-NEXT:    mov %i2, %g4
-; SPARC-NEXT:    umul %i2, %i5, %i2
+; SPARC-NEXT:    be .LBB0_33
+; SPARC-NEXT:    mov %i2, %g3
+; SPARC-NEXT:  ! %bb.1: ! %overflow.lhs
+; SPARC-NEXT:    or %i5, %i4, %l2
+; SPARC-NEXT:    cmp %l2, 0
+; SPARC-NEXT:    be .LBB0_40
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.2: ! %overflow
+; SPARC-NEXT:    umul %g3, %i5, %i2
 ; SPARC-NEXT:    rd %y, %l7
-; SPARC-NEXT:    ld [%fp+92], %l4
-; SPARC-NEXT:    umul %i4, %i3, %i3
-; SPARC-NEXT:    rd %y, %o1
-; SPARC-NEXT:    ld [%fp+96], %g3
-; SPARC-NEXT:    umul %i5, %g2, %l3
+; SPARC-NEXT:    umul %i4, %g2, %i3
+; SPARC-NEXT:    rd %y, %o2
+; SPARC-NEXT:    umul %i5, %g2, %l5
 ; SPARC-NEXT:    rd %y, %o0
-; SPARC-NEXT:    umul %l4, %i1, %l2
-; SPARC-NEXT:    rd %y, %l1
+; SPARC-NEXT:    umul %g4, %i1, %l4
+; SPARC-NEXT:    rd %y, %l3
 ; SPARC-NEXT:    add %i3, %i2, %i2
-; SPARC-NEXT:    umul %i0, %g3, %i3
+; SPARC-NEXT:    umul %i0, %l1, %i3
 ; SPARC-NEXT:    rd %y, %l6
-; SPARC-NEXT:    add %o0, %i2, %o2
-; SPARC-NEXT:    umul %i1, %g3, %i2
-; SPARC-NEXT:    rd %y, %l0
-; SPARC-NEXT:    add %i3, %l2, %i3
-; SPARC-NEXT:    add %l0, %i3, %l2
-; SPARC-NEXT:    addcc %i2, %l3, %l3
-; SPARC-NEXT:    umul %g2, %g3, %i3
+; SPARC-NEXT:    add %o0, %i2, %o1
+; SPARC-NEXT:    umul %i1, %l1, %i1
+; SPARC-NEXT:    rd %y, %i5
+; SPARC-NEXT:    add %i3, %l4, %i2
+; SPARC-NEXT:    add %i5, %i2, %l4
+; SPARC-NEXT:    addcc %i1, %l5, %i1
+; SPARC-NEXT:    umul %g2, %l1, %i3
 ; SPARC-NEXT:    rd %y, %i2
-; SPARC-NEXT:    addxcc %l2, %o2, %o4
-; SPARC-NEXT:    umul %g4, %g3, %g3
+; SPARC-NEXT:    addxcc %l4, %o1, %o4
+; SPARC-NEXT:    umul %g3, %l1, %l1
 ; SPARC-NEXT:    rd %y, %l5
-; SPARC-NEXT:    addcc %g3, %i2, %i2
-; SPARC-NEXT:    addxcc %l5, 0, %g3
-; SPARC-NEXT:    umul %g2, %l4, %g2
+; SPARC-NEXT:    addcc %l1, %i2, %i2
+; SPARC-NEXT:    addxcc %l5, 0, %l1
+; SPARC-NEXT:    umul %g2, %g4, %g2
 ; SPARC-NEXT:    rd %y, %l5
 ; SPARC-NEXT:    addcc %g2, %i2, %i2
 ; SPARC-NEXT:    addxcc %l5, 0, %g2
-; SPARC-NEXT:    addcc %g3, %g2, %g2
-; SPARC-NEXT:    addxcc %g0, 0, %g3
-; SPARC-NEXT:    umul %g4, %l4, %l5
+; SPARC-NEXT:    addcc %l1, %g2, %g2
+; SPARC-NEXT:    addxcc %g0, 0, %l1
+; SPARC-NEXT:    umul %g3, %g4, %l5
 ; SPARC-NEXT:    rd %y, %o3
 ; SPARC-NEXT:    addcc %l5, %g2, %l5
-; SPARC-NEXT:    addxcc %o3, %g3, %o3
-; SPARC-NEXT:    addcc %l5, %l3, %g2
-; SPARC-NEXT:    addxcc %o3, %o4, %g3
-; SPARC-NEXT:    mov 1, %l3
-; SPARC-NEXT:    cmp %g3, %o3
-; SPARC-NEXT:    bcs .LBB0_2
-; SPARC-NEXT:    mov %l3, %o4
-; SPARC-NEXT:  ! %bb.1: ! %start
-; SPARC-NEXT:    mov %g0, %o4
-; SPARC-NEXT:  .LBB0_2: ! %start
-; SPARC-NEXT:    cmp %g2, %l5
+; SPARC-NEXT:    addxcc %o3, %l1, %o3
+; SPARC-NEXT:    addcc %l5, %i1, %i1
+; SPARC-NEXT:    addxcc %o3, %o4, %g2
+; SPARC-NEXT:    mov 1, %l1
+; SPARC-NEXT:    cmp %g2, %o3
 ; SPARC-NEXT:    bcs .LBB0_4
-; SPARC-NEXT:    mov %l3, %l5
-; SPARC-NEXT:  ! %bb.3: ! %start
+; SPARC-NEXT:    mov %l1, %o4
+; SPARC-NEXT:  ! %bb.3: ! %overflow
+; SPARC-NEXT:    mov %g0, %o4
+; SPARC-NEXT:  .LBB0_4: ! %overflow
+; SPARC-NEXT:    cmp %i1, %l5
+; SPARC-NEXT:    bcs .LBB0_6
+; SPARC-NEXT:    mov %l1, %l5
+; SPARC-NEXT:  ! %bb.5: ! %overflow
 ; SPARC-NEXT:    mov %g0, %l5
-; SPARC-NEXT:  .LBB0_4: ! %start
-; SPARC-NEXT:    cmp %g3, %o3
-; SPARC-NEXT:    be .LBB0_6
+; SPARC-NEXT:  .LBB0_6: ! %overflow
+; SPARC-NEXT:    cmp %g2, %o3
+; SPARC-NEXT:    be .LBB0_8
 ; SPARC-NEXT:    nop
-; SPARC-NEXT:  ! %bb.5: ! %start
+; SPARC-NEXT:  ! %bb.7: ! %overflow
 ; SPARC-NEXT:    mov %o4, %l5
-; SPARC-NEXT:  .LBB0_6: ! %start
-; SPARC-NEXT:    cmp %g4, 0
-; SPARC-NEXT:    bne .LBB0_8
-; SPARC-NEXT:    mov %l3, %o3
-; SPARC-NEXT:  ! %bb.7: ! %start
-; SPARC-NEXT:    mov %g0, %o3
-; SPARC-NEXT:  .LBB0_8: ! %start
+; SPARC-NEXT:  .LBB0_8: ! %overflow
 ; SPARC-NEXT:    cmp %i4, 0
 ; SPARC-NEXT:    bne .LBB0_10
-; SPARC-NEXT:    mov %l3, %o4
-; SPARC-NEXT:  ! %bb.9: ! %start
-; SPARC-NEXT:    mov %g0, %o4
-; SPARC-NEXT:  .LBB0_10: ! %start
-; SPARC-NEXT:    cmp %o1, 0
+; SPARC-NEXT:    mov %l1, %o3
+; SPARC-NEXT:  ! %bb.9: ! %overflow
+; SPARC-NEXT:    mov %g0, %o3
+; SPARC-NEXT:  .LBB0_10: ! %overflow
+; SPARC-NEXT:    cmp %g3, 0
 ; SPARC-NEXT:    bne .LBB0_12
-; SPARC-NEXT:    mov %l3, %o1
-; SPARC-NEXT:  ! %bb.11: ! %start
-; SPARC-NEXT:    mov %g0, %o1
-; SPARC-NEXT:  .LBB0_12: ! %start
-; SPARC-NEXT:    cmp %l7, 0
+; SPARC-NEXT:    mov %l1, %o4
+; SPARC-NEXT:  ! %bb.11: ! %overflow
+; SPARC-NEXT:    mov %g0, %o4
+; SPARC-NEXT:  .LBB0_12: ! %overflow
+; SPARC-NEXT:    cmp %o2, 0
 ; SPARC-NEXT:    bne .LBB0_14
-; SPARC-NEXT:    mov %l3, %l7
-; SPARC-NEXT:  ! %bb.13: ! %start
-; SPARC-NEXT:    mov %g0, %l7
-; SPARC-NEXT:  .LBB0_14: ! %start
-; SPARC-NEXT:    cmp %o2, %o0
-; SPARC-NEXT:    bcs .LBB0_16
-; SPARC-NEXT:    mov %l3, %g4
-; SPARC-NEXT:  ! %bb.15: ! %start
-; SPARC-NEXT:    mov %g0, %g4
-; SPARC-NEXT:  .LBB0_16: ! %start
-; SPARC-NEXT:    cmp %l4, 0
-; SPARC-NEXT:    bne .LBB0_18
-; SPARC-NEXT:    mov %l3, %l4
-; SPARC-NEXT:  ! %bb.17: ! %start
-; SPARC-NEXT:    mov %g0, %l4
-; SPARC-NEXT:  .LBB0_18: ! %start
+; SPARC-NEXT:    mov %l1, %o2
+; SPARC-NEXT:  ! %bb.13: ! %overflow
+; SPARC-NEXT:    mov %g0, %o2
+; SPARC-NEXT:  .LBB0_14: ! %overflow
+; SPARC-NEXT:    cmp %l7, 0
+; SPARC-NEXT:    bne .LBB0_16
+; SPARC-NEXT:    mov %l1, %g3
+; SPARC-NEXT:  ! %bb.15: ! %overflow
+; SPARC-NEXT:    mov %g0, %g3
+; SPARC-NEXT:  .LBB0_16: ! %overflow
+; SPARC-NEXT:    cmp %o1, %o0
+; SPARC-NEXT:    bcs .LBB0_18
+; SPARC-NEXT:    mov %l1, %i4
+; SPARC-NEXT:  ! %bb.17: ! %overflow
+; SPARC-NEXT:    mov %g0, %i4
+; SPARC-NEXT:  .LBB0_18: ! %overflow
 ; SPARC-NEXT:    cmp %i0, 0
 ; SPARC-NEXT:    bne .LBB0_20
-; SPARC-NEXT:    mov %l3, %o0
-; SPARC-NEXT:  ! %bb.19: ! %start
-; SPARC-NEXT:    mov %g0, %o0
-; SPARC-NEXT:  .LBB0_20: ! %start
-; SPARC-NEXT:    cmp %l6, 0
+; SPARC-NEXT:    mov %l1, %i0
+; SPARC-NEXT:  ! %bb.19: ! %overflow
+; SPARC-NEXT:    mov %g0, %i0
+; SPARC-NEXT:  .LBB0_20: ! %overflow
+; SPARC-NEXT:    cmp %g4, 0
 ; SPARC-NEXT:    bne .LBB0_22
-; SPARC-NEXT:    mov %l3, %l6
-; SPARC-NEXT:  ! %bb.21: ! %start
-; SPARC-NEXT:    mov %g0, %l6
-; SPARC-NEXT:  .LBB0_22: ! %start
-; SPARC-NEXT:    and %o4, %o3, %o2
-; SPARC-NEXT:    cmp %l1, 0
-; SPARC-NEXT:    and %o0, %l4, %l4
+; SPARC-NEXT:    mov %l1, %l7
+; SPARC-NEXT:  ! %bb.21: ! %overflow
+; SPARC-NEXT:    mov %g0, %l7
+; SPARC-NEXT:  .LBB0_22: ! %overflow
+; SPARC-NEXT:    cmp %l6, 0
 ; SPARC-NEXT:    bne .LBB0_24
-; SPARC-NEXT:    mov %l3, %l1
-; SPARC-NEXT:  ! %bb.23: ! %start
-; SPARC-NEXT:    mov %g0, %l1
-; SPARC-NEXT:  .LBB0_24: ! %start
-; SPARC-NEXT:    or %o2, %o1, %o0
-; SPARC-NEXT:    cmp %l2, %l0
-; SPARC-NEXT:    or %l4, %l6, %l4
-; SPARC-NEXT:    bcs .LBB0_26
-; SPARC-NEXT:    mov %l3, %l0
-; SPARC-NEXT:  ! %bb.25: ! %start
-; SPARC-NEXT:    mov %g0, %l0
-; SPARC-NEXT:  .LBB0_26: ! %start
-; SPARC-NEXT:    or %o0, %l7, %l2
-; SPARC-NEXT:    or %i5, %i4, %i4
-; SPARC-NEXT:    cmp %i4, 0
-; SPARC-NEXT:    or %l4, %l1, %l1
-; SPARC-NEXT:    bne .LBB0_28
-; SPARC-NEXT:    mov %l3, %i4
-; SPARC-NEXT:  ! %bb.27: ! %start
-; SPARC-NEXT:    mov %g0, %i4
-; SPARC-NEXT:  .LBB0_28: ! %start
-; SPARC-NEXT:    or %l2, %g4, %i5
-; SPARC-NEXT:    or %i1, %i0, %i0
-; SPARC-NEXT:    cmp %i0, 0
+; SPARC-NEXT:    mov %l1, %g4
+; SPARC-NEXT:  ! %bb.23: ! %overflow
+; SPARC-NEXT:    mov %g0, %g4
+; SPARC-NEXT:  .LBB0_24: ! %overflow
+; SPARC-NEXT:    and %o3, %o4, %l6
+; SPARC-NEXT:    cmp %l3, 0
+; SPARC-NEXT:    and %i0, %l7, %l7
+; SPARC-NEXT:    bne .LBB0_26
+; SPARC-NEXT:    mov %l1, %i0
+; SPARC-NEXT:  ! %bb.25: ! %overflow
+; SPARC-NEXT:    mov %g0, %i0
+; SPARC-NEXT:  .LBB0_26: ! %overflow
+; SPARC-NEXT:    or %l6, %o2, %l3
+; SPARC-NEXT:    cmp %l4, %i5
+; SPARC-NEXT:    or %l7, %g4, %g4
+; SPARC-NEXT:    bcs .LBB0_28
+; SPARC-NEXT:    mov %l1, %i5
+; SPARC-NEXT:  ! %bb.27: ! %overflow
+; SPARC-NEXT:    mov %g0, %i5
+; SPARC-NEXT:  .LBB0_28: ! %overflow
+; SPARC-NEXT:    or %l3, %g3, %g3
+; SPARC-NEXT:    cmp %l2, 0
+; SPARC-NEXT:    or %g4, %i0, %g4
 ; SPARC-NEXT:    bne .LBB0_30
-; SPARC-NEXT:    or %l1, %l0, %i0
-; SPARC-NEXT:  ! %bb.29: ! %start
-; SPARC-NEXT:    mov %g0, %l3
-; SPARC-NEXT:  .LBB0_30: ! %start
-; SPARC-NEXT:    and %l3, %i4, %i1
-; SPARC-NEXT:    or %i1, %i0, %i0
+; SPARC-NEXT:    mov %l1, %i0
+; SPARC-NEXT:  ! %bb.29: ! %overflow
+; SPARC-NEXT:    mov %g0, %i0
+; SPARC-NEXT:  .LBB0_30: ! %overflow
+; SPARC-NEXT:    or %g3, %i4, %i4
+; SPARC-NEXT:    cmp %l0, 0
+; SPARC-NEXT:    bne .LBB0_32
+; SPARC-NEXT:    or %g4, %i5, %i5
+; SPARC-NEXT:  ! %bb.31: ! %overflow
+; SPARC-NEXT:    mov %g0, %l1
+; SPARC-NEXT:  .LBB0_32: ! %overflow
+; SPARC-NEXT:    and %l1, %i0, %i0
 ; SPARC-NEXT:    or %i0, %i5, %i0
+; SPARC-NEXT:    or %i0, %i4, %i0
+; SPARC-NEXT:    ba .LBB0_49
 ; SPARC-NEXT:    or %i0, %l5, %i0
+; SPARC-NEXT:  .LBB0_33: ! %overflow.no.lhs
+; SPARC-NEXT:    or %i5, %i4, %i2
+; SPARC-NEXT:    cmp %i2, 0
+; SPARC-NEXT:    be .LBB0_48
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.34: ! %overflow.no.lhs.only
+; SPARC-NEXT:    umul %g3, %l1, %i2
+; SPARC-NEXT:    rd %y, %l0
+; SPARC-NEXT:    umul %g2, %l1, %i3
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    addcc %i2, %l2, %i2
+; SPARC-NEXT:    addxcc %l0, 0, %l0
+; SPARC-NEXT:    umul %g2, %g4, %l2
+; SPARC-NEXT:    rd %y, %l3
+; SPARC-NEXT:    addcc %l2, %i2, %i2
+; SPARC-NEXT:    addxcc %l3, 0, %l2
+; SPARC-NEXT:    addcc %l0, %l2, %l0
+; SPARC-NEXT:    addxcc %g0, 0, %l2
+; SPARC-NEXT:    umul %g3, %g4, %l3
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %l3, %l0, %l0
+; SPARC-NEXT:    smul %l1, %i0, %l3
+; SPARC-NEXT:    umul %l1, %i1, %l1
+; SPARC-NEXT:    rd %y, %l5
+; SPARC-NEXT:    addxcc %l4, %l2, %l2
+; SPARC-NEXT:    add %l5, %l3, %l3
+; SPARC-NEXT:    smul %g4, %i1, %g4
+; SPARC-NEXT:    add %l3, %g4, %g4
+; SPARC-NEXT:    addcc %l0, %l1, %l0
+; SPARC-NEXT:    umul %g2, %i5, %l1
+; SPARC-NEXT:    rd %y, %l3
+; SPARC-NEXT:    addxcc %l2, %g4, %g4
+; SPARC-NEXT:    umul %g3, %i5, %l2
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %l2, %l3, %l2
+; SPARC-NEXT:    addxcc %l4, 0, %l3
+; SPARC-NEXT:    umul %g2, %i4, %g2
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %g2, %l2, %g2
+; SPARC-NEXT:    addxcc %l4, 0, %l2
+; SPARC-NEXT:    addcc %l3, %l2, %l2
+; SPARC-NEXT:    addxcc %g0, 0, %l3
+; SPARC-NEXT:    umul %g3, %i4, %g3
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %g3, %l2, %g3
+; SPARC-NEXT:    smul %i5, %i0, %i0
+; SPARC-NEXT:    umul %i5, %i1, %i5
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    addxcc %l4, %l3, %l3
+; SPARC-NEXT:    add %l2, %i0, %i0
+; SPARC-NEXT:    smul %i4, %i1, %i1
+; SPARC-NEXT:    add %i0, %i1, %i0
+; SPARC-NEXT:    addcc %g3, %i5, %i4
+; SPARC-NEXT:    addxcc %l3, %i0, %i5
+; SPARC-NEXT:    addcc %l0, %l1, %i1
+; SPARC-NEXT:    addxcc %g4, %g2, %g2
+; SPARC-NEXT:    mov 1, %i0
+; SPARC-NEXT:    cmp %g2, %g4
+; SPARC-NEXT:    bcs .LBB0_36
+; SPARC-NEXT:    mov %i0, %g3
+; SPARC-NEXT:  ! %bb.35: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %g3
+; SPARC-NEXT:  .LBB0_36: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %i1, %l0
+; SPARC-NEXT:    bcs .LBB0_38
+; SPARC-NEXT:    mov %i0, %l0
+; SPARC-NEXT:  ! %bb.37: ! %overflow.no.lhs.only
+; SPARC-NEXT:    mov %g0, %l0
+; SPARC-NEXT:  .LBB0_38: ! %overflow.no.lhs.only
+; SPARC-NEXT:    cmp %g2, %g4
+; SPARC-NEXT:    be .LBB0_46
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.39: ! %overflow.no.lhs.only
+; SPARC-NEXT:    ba .LBB0_46
+; SPARC-NEXT:    mov %g3, %l0
+; SPARC-NEXT:  .LBB0_40: ! %overflow.no.rhs.only
+; SPARC-NEXT:    umul %g4, %g2, %i2
+; SPARC-NEXT:    rd %y, %l0
+; SPARC-NEXT:    umul %l1, %g2, %i3
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    addcc %i2, %l2, %i2
+; SPARC-NEXT:    addxcc %l0, 0, %l0
+; SPARC-NEXT:    umul %l1, %g3, %l2
+; SPARC-NEXT:    rd %y, %l3
+; SPARC-NEXT:    addcc %l2, %i2, %i2
+; SPARC-NEXT:    addxcc %l3, 0, %l2
+; SPARC-NEXT:    addcc %l0, %l2, %l0
+; SPARC-NEXT:    addxcc %g0, 0, %l2
+; SPARC-NEXT:    umul %g4, %g3, %l3
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %l3, %l0, %l0
+; SPARC-NEXT:    smul %g2, %i4, %l3
+; SPARC-NEXT:    umul %g2, %i5, %g2
+; SPARC-NEXT:    rd %y, %l5
+; SPARC-NEXT:    addxcc %l4, %l2, %l2
+; SPARC-NEXT:    add %l5, %l3, %l3
+; SPARC-NEXT:    smul %g3, %i5, %g3
+; SPARC-NEXT:    add %l3, %g3, %g3
+; SPARC-NEXT:    addcc %l0, %g2, %l0
+; SPARC-NEXT:    umul %l1, %i1, %g2
+; SPARC-NEXT:    rd %y, %l3
+; SPARC-NEXT:    addxcc %l2, %g3, %g3
+; SPARC-NEXT:    umul %g4, %i1, %l2
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %l2, %l3, %l2
+; SPARC-NEXT:    addxcc %l4, 0, %l3
+; SPARC-NEXT:    umul %l1, %i0, %l1
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %l1, %l2, %l1
+; SPARC-NEXT:    addxcc %l4, 0, %l2
+; SPARC-NEXT:    addcc %l3, %l2, %l2
+; SPARC-NEXT:    addxcc %g0, 0, %l3
+; SPARC-NEXT:    umul %g4, %i0, %g4
+; SPARC-NEXT:    rd %y, %l4
+; SPARC-NEXT:    addcc %g4, %l2, %g4
+; SPARC-NEXT:    smul %i1, %i4, %i4
+; SPARC-NEXT:    umul %i1, %i5, %i1
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    addxcc %l4, %l3, %l3
+; SPARC-NEXT:    add %l2, %i4, %i4
+; SPARC-NEXT:    smul %i0, %i5, %i0
+; SPARC-NEXT:    add %i4, %i0, %i0
+; SPARC-NEXT:    addcc %g4, %i1, %i4
+; SPARC-NEXT:    addxcc %l3, %i0, %i5
+; SPARC-NEXT:    addcc %l0, %g2, %i1
+; SPARC-NEXT:    addxcc %g3, %l1, %g2
+; SPARC-NEXT:    mov 1, %i0
+; SPARC-NEXT:    cmp %g2, %g3
+; SPARC-NEXT:    bcs .LBB0_42
+; SPARC-NEXT:    mov %i0, %g4
+; SPARC-NEXT:  ! %bb.41: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %g4
+; SPARC-NEXT:  .LBB0_42: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %i1, %l0
+; SPARC-NEXT:    bcs .LBB0_44
+; SPARC-NEXT:    mov %i0, %l0
+; SPARC-NEXT:  ! %bb.43: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g0, %l0
+; SPARC-NEXT:  .LBB0_44: ! %overflow.no.rhs.only
+; SPARC-NEXT:    cmp %g2, %g3
+; SPARC-NEXT:    be .LBB0_46
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.45: ! %overflow.no.rhs.only
+; SPARC-NEXT:    mov %g4, %l0
+; SPARC-NEXT:  .LBB0_46: ! %overflow.no.rhs.only
+; SPARC-NEXT:    addcc %i4, %l0, %i4
+; SPARC-NEXT:    addxcc %i5, 0, %i5
+; SPARC-NEXT:    or %i4, %i5, %i4
+; SPARC-NEXT:    cmp %i4, 0
+; SPARC-NEXT:    bne .LBB0_49
+; SPARC-NEXT:    nop
+; SPARC-NEXT:  ! %bb.47: ! %overflow.no.rhs.only
+; SPARC-NEXT:    ba .LBB0_49
+; SPARC-NEXT:    mov %g0, %i0
+; SPARC-NEXT:  .LBB0_48: ! %overflow.no
+; SPARC-NEXT:    smul %l1, %i0, %i3
+; SPARC-NEXT:    umul %l1, %i1, %i2
+; SPARC-NEXT:    rd %y, %l0
+; SPARC-NEXT:    mov %g0, %i0
+; SPARC-NEXT:    add %l0, %i3, %i3
+; SPARC-NEXT:    smul %g4, %i1, %i1
+; SPARC-NEXT:    smul %i5, %g3, %l0
+; SPARC-NEXT:    umul %i5, %g2, %i5
+; SPARC-NEXT:    rd %y, %l2
+; SPARC-NEXT:    add %i3, %i1, %i1
+; SPARC-NEXT:    add %l2, %l0, %i3
+; SPARC-NEXT:    smul %i4, %g2, %i4
+; SPARC-NEXT:    add %i3, %i4, %i4
+; SPARC-NEXT:    addcc %i5, %i2, %i5
+; SPARC-NEXT:    umul %g2, %l1, %i3
+; SPARC-NEXT:    rd %y, %i2
+; SPARC-NEXT:    addxcc %i4, %i1, %i4
+; SPARC-NEXT:    umul %g3, %l1, %i1
+; SPARC-NEXT:    rd %y, %l0
+; SPARC-NEXT:    addcc %i1, %i2, %i1
+; SPARC-NEXT:    addxcc %l0, 0, %l0
+; SPARC-NEXT:    umul %g2, %g4, %i2
+; SPARC-NEXT:    rd %y, %g2
+; SPARC-NEXT:    addcc %i2, %i1, %i2
+; SPARC-NEXT:    addxcc %g2, 0, %i1
+; SPARC-NEXT:    addcc %l0, %i1, %i1
+; SPARC-NEXT:    addxcc %g0, 0, %g2
+; SPARC-NEXT:    umul %g3, %g4, %g3
+; SPARC-NEXT:    rd %y, %g4
+; SPARC-NEXT:    addcc %g3, %i1, %i1
+; SPARC-NEXT:    addxcc %g4, %g2, %g2
+; SPARC-NEXT:    addcc %i1, %i5, %i1
+; SPARC-NEXT:    addxcc %g2, %i4, %g2
+; SPARC-NEXT:  .LBB0_49: ! %overflow.res
 ; SPARC-NEXT:    and %i0, 1, %i4
-; SPARC-NEXT:    mov %g3, %i0
 ; SPARC-NEXT:    ret
-; SPARC-NEXT:    restore %g0, %g2, %o1
+; SPARC-NEXT:    restore %g0, %g2, %o0
 ;
 ; SPARC64-LABEL: muloti_test:
 ; SPARC64:         .register %g2, #scratch
 ; SPARC64-NEXT:    .register %g3, #scratch
-; SPARC64-NEXT:  ! %bb.0: ! %start
+; SPARC64-NEXT:  ! %bb.0: ! %overflow.entry
 ; SPARC64-NEXT:    save %sp, -176, %sp
-; SPARC64-NEXT:    mov %i0, %l1
+; SPARC64-NEXT:    brz %i0, .LBB0_3
+; SPARC64-NEXT:    mov %i1, %i4
+; SPARC64-NEXT:  ! %bb.1: ! %overflow.lhs
+; SPARC64-NEXT:    brz %i2, .LBB0_5
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:  ! %bb.2: ! %overflow
 ; SPARC64-NEXT:    mov %g0, %o0
 ; SPARC64-NEXT:    mov %i2, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
-; SPARC64-NEXT:    mov %i1, %o3
-; SPARC64-NEXT:    mov %o0, %i4
-; SPARC64-NEXT:    mov %o1, %i5
+; SPARC64-NEXT:    mov %i4, %o3
+; SPARC64-NEXT:    mov %o0, %i5
+; SPARC64-NEXT:    mov %o1, %i1
 ; SPARC64-NEXT:    mov %g0, %o0
 ; SPARC64-NEXT:    mov %i0, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
 ; SPARC64-NEXT:    mov %i3, %o3
 ; SPARC64-NEXT:    mov %o0, %l0
-; SPARC64-NEXT:    add %o1, %i5, %i0
+; SPARC64-NEXT:    add %o1, %i1, %l1
 ; SPARC64-NEXT:    mov %g0, %o0
-; SPARC64-NEXT:    mov %i1, %o1
+; SPARC64-NEXT:    mov %i4, %o1
 ; SPARC64-NEXT:    mov %g0, %o2
 ; SPARC64-NEXT:    call __multi3
 ; SPARC64-NEXT:    mov %i3, %o3
-; SPARC64-NEXT:    mov %g0, %i1
-; SPARC64-NEXT:    mov %g0, %i3
-; SPARC64-NEXT:    mov %g0, %i5
+; SPARC64-NEXT:    mov %o1, %i1
+; SPARC64-NEXT:    mov %g0, %i4
 ; SPARC64-NEXT:    mov %g0, %g2
 ; SPARC64-NEXT:    mov %g0, %g3
-; SPARC64-NEXT:    add %o0, %i0, %i0
-; SPARC64-NEXT:    cmp %i0, %o0
-; SPARC64-NEXT:    movrnz %l0, 1, %i3
-; SPARC64-NEXT:    movrnz %i2, 1, %i5
-; SPARC64-NEXT:    movrnz %l1, 1, %g2
-; SPARC64-NEXT:    movcs %xcc, 1, %i1
-; SPARC64-NEXT:    and %g2, %i5, %i2
-; SPARC64-NEXT:    or %i2, %i3, %i2
-; SPARC64-NEXT:    movrnz %i4, 1, %g3
-; SPARC64-NEXT:    or %i2, %g3, %i2
-; SPARC64-NEXT:    or %i2, %i1, %i1
-; SPARC64-NEXT:    srl %i1, 0, %i2
+; SPARC64-NEXT:    mov %g0, %g4
+; SPARC64-NEXT:    mov %g0, %g5
+; SPARC64-NEXT:    add %o0, %l1, %i3
+; SPARC64-NEXT:    cmp %i3, %o0
+; SPARC64-NEXT:    movrnz %i2, 1, %g2
+; SPARC64-NEXT:    movrnz %i0, 1, %g3
+; SPARC64-NEXT:    and %g3, %g2, %i0
+; SPARC64-NEXT:    movcs %xcc, 1, %i4
+; SPARC64-NEXT:    movrnz %l0, 1, %g4
+; SPARC64-NEXT:    or %i0, %g4, %i0
+; SPARC64-NEXT:    movrnz %i5, 1, %g5
+; SPARC64-NEXT:    or %i0, %g5, %i0
+; SPARC64-NEXT:    ba .LBB0_8
+; SPARC64-NEXT:    or %i0, %i4, %i0
+; SPARC64-NEXT:  .LBB0_3: ! %overflow.no.lhs
+; SPARC64-NEXT:    brz %i2, .LBB0_7
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:  ! %bb.4: ! %overflow.no.lhs.only
+; SPARC64-NEXT:    mov %i0, %o0
+; SPARC64-NEXT:    mov %i4, %o1
+; SPARC64-NEXT:    mov %g0, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i3, %o3
+; SPARC64-NEXT:    mov %o0, %i5
+; SPARC64-NEXT:    mov %o1, %i1
+; SPARC64-NEXT:    mov %i0, %o0
+; SPARC64-NEXT:    mov %i4, %o1
+; SPARC64-NEXT:    mov %g0, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i2, %o3
+; SPARC64-NEXT:    mov %g0, %i2
+; SPARC64-NEXT:    mov %g0, %i0
+; SPARC64-NEXT:    add %i5, %o1, %i3
+; SPARC64-NEXT:    ba .LBB0_6
+; SPARC64-NEXT:    cmp %i3, %i5
+; SPARC64-NEXT:  .LBB0_5: ! %overflow.no.rhs.only
+; SPARC64-NEXT:    mov %i2, %o0
+; SPARC64-NEXT:    mov %i3, %o1
+; SPARC64-NEXT:    mov %g0, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i4, %o3
+; SPARC64-NEXT:    mov %o0, %i4
+; SPARC64-NEXT:    mov %o1, %i1
+; SPARC64-NEXT:    mov %i2, %o0
+; SPARC64-NEXT:    mov %i3, %o1
+; SPARC64-NEXT:    mov %g0, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i0, %o3
+; SPARC64-NEXT:    mov %g0, %i2
+; SPARC64-NEXT:    mov %g0, %i0
+; SPARC64-NEXT:    add %i4, %o1, %i3
+; SPARC64-NEXT:    cmp %i3, %i4
+; SPARC64-NEXT:  .LBB0_6: ! %overflow.res
+; SPARC64-NEXT:    movcs %xcc, 1, %i2
+; SPARC64-NEXT:    srl %i2, 0, %i2
+; SPARC64-NEXT:    add %o0, %i2, %i2
+; SPARC64-NEXT:    ba .LBB0_8
+; SPARC64-NEXT:    movrnz %i2, 1, %i0
+; SPARC64-NEXT:  .LBB0_7: ! %overflow.no
+; SPARC64-NEXT:    mov %i0, %o0
+; SPARC64-NEXT:    mov %i4, %o1
+; SPARC64-NEXT:    mov %i2, %o2
+; SPARC64-NEXT:    call __multi3
+; SPARC64-NEXT:    mov %i3, %o3
+; SPARC64-NEXT:    mov %o0, %i3
+; SPARC64-NEXT:    mov %o1, %i1
+; SPARC64-NEXT:    mov %g0, %i0
+; SPARC64-NEXT:  .LBB0_8: ! %overflow.res
+; SPARC64-NEXT:    and %i0, 1, %i2
 ; SPARC64-NEXT:    ret
-; SPARC64-NEXT:    restore %g0, %o1, %o1
+; SPARC64-NEXT:    restore %g0, %i3, %o0
 ;
 ; SPARC64-VIS3-LABEL: muloti_test:
 ; SPARC64-VIS3:         .register %g2, #scratch
 ; SPARC64-VIS3-NEXT:    .register %g3, #scratch
-; SPARC64-VIS3-NEXT:  ! %bb.0: ! %start
+; SPARC64-VIS3-NEXT:  ! %bb.0: ! %overflow.entry
 ; SPARC64-VIS3-NEXT:    save %sp, -128, %sp
+; SPARC64-VIS3-NEXT:    brz %i0, .LBB0_3
+; SPARC64-VIS3-NEXT:    nop
+; SPARC64-VIS3-NEXT:  ! %bb.1: ! %overflow.lhs
+; SPARC64-VIS3-NEXT:    brz %i2, .LBB0_5
+; SPARC64-VIS3-NEXT:    nop
+; SPARC64-VIS3-NEXT:  ! %bb.2: ! %overflow
 ; SPARC64-VIS3-NEXT:    mov %g0, %i5
 ; SPARC64-VIS3-NEXT:    mov %g0, %g2
 ; SPARC64-VIS3-NEXT:    mov %g0, %g3
@@ -227,9 +490,59 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind {
 ; SPARC64-VIS3-NEXT:    umulxhi %i2, %i1, %i2
 ; SPARC64-VIS3-NEXT:    movrnz %i2, 1, %g5
 ; SPARC64-VIS3-NEXT:    or %i0, %g5, %i0
-; SPARC64-VIS3-NEXT:    or %i0, %i5, %i0
+; SPARC64-VIS3-NEXT:    ba .LBB0_7
+; SPARC64-VIS3-NEXT:    or %i0, %i5, %i5
+; SPARC64-VIS3-NEXT:  .LBB0_3: ! %overflow.no.lhs
+; SPARC64-VIS3-NEXT:    brz %i2, .LBB0_6
+; SPARC64-VIS3-NEXT:    nop
+; SPARC64-VIS3-NEXT:  ! %bb.4: ! %overflow.no.lhs.only
+; SPARC64-VIS3-NEXT:    mov %g0, %g2
+; SPARC64-VIS3-NEXT:    mov %g0, %i5
+; SPARC64-VIS3-NEXT:    mulx %i0, %i3, %i4
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i3, %g3
+; SPARC64-VIS3-NEXT:    add %g3, %i4, %g3
+; SPARC64-VIS3-NEXT:    mulx %i0, %i2, %i0
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i2, %i4
+; SPARC64-VIS3-NEXT:    add %i4, %i0, %i0
+; SPARC64-VIS3-NEXT:    mulx %i1, %i3, %i3
+; SPARC64-VIS3-NEXT:    mulx %i1, %i2, %i2
+; SPARC64-VIS3-NEXT:    mov %i3, %i1
+; SPARC64-VIS3-NEXT:    add %g3, %i2, %i4
+; SPARC64-VIS3-NEXT:    cmp %i4, %g3
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %g2
+; SPARC64-VIS3-NEXT:    srl %g2, 0, %i2
+; SPARC64-VIS3-NEXT:    add %i0, %i2, %i0
+; SPARC64-VIS3-NEXT:    ba .LBB0_8
+; SPARC64-VIS3-NEXT:    movrnz %i0, 1, %i5
+; SPARC64-VIS3-NEXT:  .LBB0_5: ! %overflow.no.rhs.only
+; SPARC64-VIS3-NEXT:    mov %g0, %g2
+; SPARC64-VIS3-NEXT:    mov %g0, %i5
+; SPARC64-VIS3-NEXT:    mulx %i2, %i1, %i4
+; SPARC64-VIS3-NEXT:    umulxhi %i3, %i1, %g3
+; SPARC64-VIS3-NEXT:    add %g3, %i4, %g3
+; SPARC64-VIS3-NEXT:    mulx %i2, %i0, %i2
+; SPARC64-VIS3-NEXT:    umulxhi %i3, %i0, %i4
+; SPARC64-VIS3-NEXT:    add %i4, %i2, %i2
+; SPARC64-VIS3-NEXT:    mulx %i3, %i1, %i1
+; SPARC64-VIS3-NEXT:    mulx %i3, %i0, %i0
+; SPARC64-VIS3-NEXT:    add %g3, %i0, %i4
+; SPARC64-VIS3-NEXT:    cmp %i4, %g3
+; SPARC64-VIS3-NEXT:    movcs %xcc, 1, %g2
+; SPARC64-VIS3-NEXT:    srl %g2, 0, %i0
+; SPARC64-VIS3-NEXT:    add %i2, %i0, %i0
+; SPARC64-VIS3-NEXT:    ba .LBB0_8
+; SPARC64-VIS3-NEXT:    movrnz %i0, 1, %i5
+; SPARC64-VIS3-NEXT:  .LBB0_6: ! %overflow.no
+; SPARC64-VIS3-NEXT:    mov %g0, %i5
+; SPARC64-VIS3-NEXT:    mulx %i1, %i2, %i2
+; SPARC64-VIS3-NEXT:    umulxhi %i1, %i3, %i4
+; SPARC64-VIS3-NEXT:    add %i4, %i2, %i2
+; SPARC64-VIS3-NEXT:    mulx %i0, %i3, %i0
+; SPARC64-VIS3-NEXT:    add %i2, %i0, %i4
+; SPARC64-VIS3-NEXT:  .LBB0_7: ! %overflow.res
 ; SPARC64-VIS3-NEXT:    mulx %i1, %i3, %i1
-; SPARC64-VIS3-NEXT:    srl %i0, 0, %i2
+; SPARC64-VIS3-NEXT:  .LBB0_8: ! %overflow.res
+; SPARC64-VIS3-NEXT:    and %i5, 1, %i2
 ; SPARC64-VIS3-NEXT:    ret
 ; SPARC64-VIS3-NEXT:    restore %g0, %i4, %o0
 start:
diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index 9b5fa1c2bc811..c19ce3f34011e 100644
--- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -3,200 +3,568 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV6-LABEL: muloti_test:
-; THUMBV6:       @ %bb.0: @ %start
+; THUMBV6:       @ %bb.0: @ %overflow.entry
 ; THUMBV6-NEXT:    .save {r4, r5, r6, r7, lr}
 ; THUMBV6-NEXT:    push {r4, r5, r6, r7, lr}
-; THUMBV6-NEXT:    .pad #60
-; THUMBV6-NEXT:    sub sp, #60
+; THUMBV6-NEXT:    .pad #84
+; THUMBV6-NEXT:    sub sp, #84
 ; THUMBV6-NEXT:    mov r6, r3
-; THUMBV6-NEXT:    mov r1, r2
-; THUMBV6-NEXT:    str r2, [sp, #52] @ 4-byte Spill
-; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    str r0, [sp, #48] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #108]
+; THUMBV6-NEXT:    ldr r5, [sp, #104]
+; THUMBV6-NEXT:    str r5, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT:    str r0, [sp, #52] @ 4-byte Spill
+; THUMBV6-NEXT:    orrs r5, r0
+; THUMBV6-NEXT:    ldr r1, [sp, #124]
+; THUMBV6-NEXT:    ldr r4, [sp, #120]
+; THUMBV6-NEXT:    ldr r0, [sp, #116]
+; THUMBV6-NEXT:    str r0, [sp, #68] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r3, [sp, #112]
+; THUMBV6-NEXT:    str r4, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #60] @ 4-byte Spill
+; THUMBV6-NEXT:    str r2, [sp, #72] @ 4-byte Spill
+; THUMBV6-NEXT:    str r6, [sp, #76] @ 4-byte Spill
+; THUMBV6-NEXT:    str r3, [sp, #64] @ 4-byte Spill
+; THUMBV6-NEXT:    bne .LBB0_1
+; THUMBV6-NEXT:    b .LBB0_3
+; THUMBV6-NEXT:  .LBB0_1: @ %overflow.lhs
+; THUMBV6-NEXT:    orrs r4, r1
+; THUMBV6-NEXT:    bne .LBB0_2
+; THUMBV6-NEXT:    b .LBB0_5
+; THUMBV6-NEXT:  .LBB0_2: @ %overflow
+; THUMBV6-NEXT:    str r4, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    movs r4, #0
+; THUMBV6-NEXT:    mov r0, r6
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r7, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #32] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r6, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r6
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r1, [sp, #24] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
 ; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
-; THUMBV6-NEXT:    ldr r2, [sp, #88]
-; THUMBV6-NEXT:    str r2, [sp, #48] @ 4-byte Spill
-; THUMBV6-NEXT:    movs r5, #0
-; THUMBV6-NEXT:    mov r0, r1
-; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r6
+; THUMBV6-NEXT:    mov r3, r4
 ; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r1, r0
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    adcs r1, r4
 ; THUMBV6-NEXT:    str r1, [sp, #28] @ 4-byte Spill
-; THUMBV6-NEXT:    str r0, [r4]
-; THUMBV6-NEXT:    ldr r2, [sp, #96]
-; THUMBV6-NEXT:    str r2, [sp, #36] @ 4-byte Spill
-; THUMBV6-NEXT:    mov r4, r6
-; THUMBV6-NEXT:    str r6, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r7, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #12] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; THUMBV6-NEXT:    str r5, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r5, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r0, r5
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r6, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r6
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #12] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r6
+; THUMBV6-NEXT:    mov r7, r6
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r2, r1, r2
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    adcs r1, r4
+; THUMBV6-NEXT:    str r1, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #12] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r2, r0
+; THUMBV6-NEXT:    str r2, [sp, #8] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r6, [sp, #72] @ 4-byte Reload
 ; THUMBV6-NEXT:    mov r0, r6
-; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
 ; THUMBV6-NEXT:    bl __aeabi_lmul
 ; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
 ; THUMBV6-NEXT:    mov r7, r1
-; THUMBV6-NEXT:    subs r0, r1, #1
-; THUMBV6-NEXT:    sbcs r7, r0
-; THUMBV6-NEXT:    ldr r0, [sp, #100]
-; THUMBV6-NEXT:    str r0, [sp, #32] @ 4-byte Spill
-; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    ldr r6, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r7, r4
+; THUMBV6-NEXT:    mov r0, r6
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r6, [sp, #68] @ 4-byte Reload
 ; THUMBV6-NEXT:    mov r2, r6
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    mov r3, r4
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    str r0, [sp, #24] @ 4-byte Spill
-; THUMBV6-NEXT:    subs r2, r1, #1
-; THUMBV6-NEXT:    sbcs r1, r2
-; THUMBV6-NEXT:    subs r2, r4, #1
+; THUMBV6-NEXT:    ldr r2, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r4
+; THUMBV6-NEXT:    adds r0, r7, r1
+; THUMBV6-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r7, r4
+; THUMBV6-NEXT:    adcs r7, r4
+; THUMBV6-NEXT:    ldr r0, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r6
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r7
+; THUMBV6-NEXT:    str r1, [sp] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r6
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r0
+; THUMBV6-NEXT:    str r1, [sp, #64] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r7, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r7
+; THUMBV6-NEXT:    mov r2, r4
 ; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    adds r0, r0, r6
+; THUMBV6-NEXT:    ldr r2, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r2, r0
+; THUMBV6-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    str r0, [sp, #72] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r0
+; THUMBV6-NEXT:    adcs r4, r4
+; THUMBV6-NEXT:    ldr r3, [sp, #32] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r3, #1
 ; THUMBV6-NEXT:    sbcs r3, r2
-; THUMBV6-NEXT:    ldr r4, [sp, #32] @ 4-byte Reload
-; THUMBV6-NEXT:    subs r2, r4, #1
-; THUMBV6-NEXT:    sbcs r4, r2
-; THUMBV6-NEXT:    ands r4, r3
-; THUMBV6-NEXT:    orrs r4, r1
-; THUMBV6-NEXT:    orrs r4, r7
-; THUMBV6-NEXT:    ldr r0, [sp, #44] @ 4-byte Reload
-; THUMBV6-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; THUMBV6-NEXT:    adds r7, r1, r0
+; THUMBV6-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r0, #1
+; THUMBV6-NEXT:    sbcs r0, r2
+; THUMBV6-NEXT:    subs r2, r7, #1
+; THUMBV6-NEXT:    sbcs r7, r2
+; THUMBV6-NEXT:    mov r6, r7
+; THUMBV6-NEXT:    ldr r7, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r7, #1
+; THUMBV6-NEXT:    sbcs r7, r2
+; THUMBV6-NEXT:    ands r7, r6
+; THUMBV6-NEXT:    orrs r7, r0
+; THUMBV6-NEXT:    orrs r7, r3
+; THUMBV6-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r7, r0
+; THUMBV6-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r0, #1
+; THUMBV6-NEXT:    sbcs r0, r2
+; THUMBV6-NEXT:    ldr r3, [sp, #16] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r3, #1
+; THUMBV6-NEXT:    sbcs r3, r2
+; THUMBV6-NEXT:    mov r6, r3
+; THUMBV6-NEXT:    ldr r3, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r3, #1
+; THUMBV6-NEXT:    sbcs r3, r2
+; THUMBV6-NEXT:    subs r2, r5, #1
+; THUMBV6-NEXT:    sbcs r5, r2
+; THUMBV6-NEXT:    ands r5, r3
+; THUMBV6-NEXT:    orrs r5, r6
+; THUMBV6-NEXT:    orrs r5, r0
+; THUMBV6-NEXT:    ldr r0, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r5, r2
+; THUMBV6-NEXT:    ldr r3, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r3, #1
+; THUMBV6-NEXT:    sbcs r3, r2
+; THUMBV6-NEXT:    mov r6, r3
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    subs r2, r2, #1
+; THUMBV6-NEXT:    ldr r3, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    sbcs r3, r2
+; THUMBV6-NEXT:    str r3, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    ands r2, r6
+; THUMBV6-NEXT:    str r2, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r2, r5
+; THUMBV6-NEXT:    str r2, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r5, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r5, r7
+; THUMBV6-NEXT:    orrs r5, r4
+; THUMBV6-NEXT:    b .LBB0_8
+; THUMBV6-NEXT:  .LBB0_3: @ %overflow.no.lhs
+; THUMBV6-NEXT:    mov r0, r4
+; THUMBV6-NEXT:    orrs r0, r1
+; THUMBV6-NEXT:    bne .LBB0_4
+; THUMBV6-NEXT:    b .LBB0_7
+; THUMBV6-NEXT:  .LBB0_4: @ %overflow.no.lhs.only
+; THUMBV6-NEXT:    mov r5, r4
+; THUMBV6-NEXT:    movs r4, #0
+; THUMBV6-NEXT:    mov r0, r2
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r7, r2
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    str r5, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #32] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r6
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r6, r4
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r5, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    str r0, [sp, #28] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r4
+; THUMBV6-NEXT:    adds r0, r6, r1
+; THUMBV6-NEXT:    str r0, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r7, r4
+; THUMBV6-NEXT:    adcs r7, r4
+; THUMBV6-NEXT:    ldr r0, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r6, r7
 ; THUMBV6-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
 ; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    mov r2, r6
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #60] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r7, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    ldr r5, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r4
 ; THUMBV6-NEXT:    bl __aeabi_lmul
 ; THUMBV6-NEXT:    str r0, [sp, #24] @ 4-byte Spill
-; THUMBV6-NEXT:    adds r0, r1, r7
-; THUMBV6-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; THUMBV6-NEXT:    mov r0, r5
-; THUMBV6-NEXT:    adcs r0, r5
-; THUMBV6-NEXT:    orrs r0, r4
-; THUMBV6-NEXT:    str r0, [sp, #16] @ 4-byte Spill
-; THUMBV6-NEXT:    ldr r0, [sp, #92]
+; THUMBV6-NEXT:    str r1, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r2, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r3, r0, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r2, r1
+; THUMBV6-NEXT:    ldr r0, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r3
+; THUMBV6-NEXT:    str r0, [sp, #60] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r2, r6
+; THUMBV6-NEXT:    str r2, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r7, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
 ; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
-; THUMBV6-NEXT:    ldr r7, [sp, #80]
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    mov r0, r5
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r7, r1
+; THUMBV6-NEXT:    adds r6, r0, r6
+; THUMBV6-NEXT:    adcs r7, r4
+; THUMBV6-NEXT:    ldr r0, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r5, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    adds r0, r0, r6
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r4
+; THUMBV6-NEXT:    adds r0, r7, r1
+; THUMBV6-NEXT:    str r0, [sp, #72] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r7, r4
+; THUMBV6-NEXT:    adcs r7, r4
+; THUMBV6-NEXT:    ldr r0, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #76] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r6, r7
+; THUMBV6-NEXT:    ldr r0, [sp, #64] @ 4-byte Reload
 ; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r5, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    ldr r2, [sp, #24] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r2, r0
+; THUMBV6-NEXT:    ldr r2, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r2, r0
+; THUMBV6-NEXT:    adcs r1, r6
+; THUMBV6-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #28] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    b .LBB0_6
+; THUMBV6-NEXT:  .LBB0_5: @ %overflow.no.rhs.only
+; THUMBV6-NEXT:    movs r4, #0
+; THUMBV6-NEXT:    mov r0, r3
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r7, [sp, #56] @ 4-byte Reload
 ; THUMBV6-NEXT:    mov r2, r7
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    mov r5, r3
+; THUMBV6-NEXT:    mov r3, r4
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    str r0, [sp, #12] @ 4-byte Spill
-; THUMBV6-NEXT:    mov r4, r1
-; THUMBV6-NEXT:    subs r0, r1, #1
-; THUMBV6-NEXT:    sbcs r4, r0
-; THUMBV6-NEXT:    ldr r6, [sp, #84]
+; THUMBV6-NEXT:    str r0, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r7, r0, r1
+; THUMBV6-NEXT:    adcs r6, r4
+; THUMBV6-NEXT:    mov r0, r5
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r5, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    adds r0, r0, r7
+; THUMBV6-NEXT:    str r0, [sp, #32] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r4
+; THUMBV6-NEXT:    adds r0, r6, r1
+; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r6, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r7, r4
+; THUMBV6-NEXT:    adcs r7, r4
 ; THUMBV6-NEXT:    mov r0, r6
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r7
+; THUMBV6-NEXT:    str r1, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #56] @ 4-byte Reload
 ; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    ldr r2, [sp, #48] @ 4-byte Reload
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #60] @ 4-byte Reload
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; THUMBV6-NEXT:    subs r2, r1, #1
-; THUMBV6-NEXT:    sbcs r1, r2
-; THUMBV6-NEXT:    ldr r3, [sp, #44] @ 4-byte Reload
-; THUMBV6-NEXT:    subs r2, r3, #1
-; THUMBV6-NEXT:    sbcs r3, r2
-; THUMBV6-NEXT:    str r6, [sp, #8] @ 4-byte Spill
-; THUMBV6-NEXT:    subs r2, r6, #1
-; THUMBV6-NEXT:    sbcs r6, r2
-; THUMBV6-NEXT:    ands r6, r3
-; THUMBV6-NEXT:    orrs r6, r1
-; THUMBV6-NEXT:    orrs r6, r4
-; THUMBV6-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; THUMBV6-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
-; THUMBV6-NEXT:    adds r0, r1, r0
-; THUMBV6-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; THUMBV6-NEXT:    str r0, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r5, r1
+; THUMBV6-NEXT:    ldr r7, [sp, #64] @ 4-byte Reload
 ; THUMBV6-NEXT:    mov r0, r7
-; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    ldr r4, [sp, #48] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r6
 ; THUMBV6-NEXT:    mov r2, r4
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    mov r3, r4
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    str r0, [sp, #12] @ 4-byte Spill
-; THUMBV6-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; THUMBV6-NEXT:    adds r0, r1, r0
-; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    adcs r1, r5
-; THUMBV6-NEXT:    orrs r1, r6
-; THUMBV6-NEXT:    ldr r3, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT:    str r0, [sp, #28] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #52] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r2, r0, r2
+; THUMBV6-NEXT:    adcs r5, r1
+; THUMBV6-NEXT:    ldr r0, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    str r0, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r5, r0
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r7, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r6
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r7, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r7, r4
+; THUMBV6-NEXT:    ldr r0, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    ldr r6, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r6
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    ldr r2, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r4
+; THUMBV6-NEXT:    adds r0, r7, r1
+; THUMBV6-NEXT:    str r0, [sp, #64] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r7, r4
+; THUMBV6-NEXT:    adcs r7, r4
+; THUMBV6-NEXT:    ldr r0, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r4
+; THUMBV6-NEXT:    mov r2, r6
+; THUMBV6-NEXT:    mov r3, r4
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #68] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r6, r7
+; THUMBV6-NEXT:    add r2, sp, #72
+; THUMBV6-NEXT:    ldm r2, {r0, r1, r2} @ 12-byte Folded Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    ldr r2, [sp, #28] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r2, r0
+; THUMBV6-NEXT:    ldr r2, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r2, r0
+; THUMBV6-NEXT:    adcs r1, r6
+; THUMBV6-NEXT:    ldr r2, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
 ; THUMBV6-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
-; THUMBV6-NEXT:    orrs r3, r2
-; THUMBV6-NEXT:    subs r2, r3, #1
-; THUMBV6-NEXT:    sbcs r3, r2
-; THUMBV6-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
-; THUMBV6-NEXT:    orrs r7, r2
-; THUMBV6-NEXT:    subs r2, r7, #1
-; THUMBV6-NEXT:    sbcs r7, r2
-; THUMBV6-NEXT:    ands r7, r3
-; THUMBV6-NEXT:    orrs r7, r1
-; THUMBV6-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; THUMBV6-NEXT:    orrs r7, r1
-; THUMBV6-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; THUMBV6-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
-; THUMBV6-NEXT:    adds r1, r2, r1
-; THUMBV6-NEXT:    str r1, [sp, #32] @ 4-byte Spill
-; THUMBV6-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
-; THUMBV6-NEXT:    adcs r0, r1
-; THUMBV6-NEXT:    str r0, [sp, #36] @ 4-byte Spill
-; THUMBV6-NEXT:    ldr r0, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:  .LBB0_6: @ %overflow.res
+; THUMBV6-NEXT:    adcs r2, r4
+; THUMBV6-NEXT:    adcs r5, r4
+; THUMBV6-NEXT:    orrs r5, r2
+; THUMBV6-NEXT:    subs r2, r5, #1
+; THUMBV6-NEXT:    sbcs r5, r2
+; THUMBV6-NEXT:    b .LBB0_8
+; THUMBV6-NEXT:  .LBB0_7: @ %overflow.no
+; THUMBV6-NEXT:    movs r5, #0
+; THUMBV6-NEXT:    mov r0, r2
 ; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r7, r2
+; THUMBV6-NEXT:    mov r2, r3
+; THUMBV6-NEXT:    mov r4, r3
 ; THUMBV6-NEXT:    mov r3, r5
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    mov r4, r1
-; THUMBV6-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
-; THUMBV6-NEXT:    adds r6, r0, r1
-; THUMBV6-NEXT:    adcs r4, r5
-; THUMBV6-NEXT:    ldr r0, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    str r0, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r6
 ; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    ldr r2, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r4
 ; THUMBV6-NEXT:    mov r3, r5
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    adds r0, r0, r6
-; THUMBV6-NEXT:    ldr r2, [sp, #40] @ 4-byte Reload
-; THUMBV6-NEXT:    str r0, [r2, #4]
-; THUMBV6-NEXT:    adcs r1, r5
-; THUMBV6-NEXT:    adds r0, r4, r1
-; THUMBV6-NEXT:    str r0, [sp, #28] @ 4-byte Spill
-; THUMBV6-NEXT:    mov r6, r5
+; THUMBV6-NEXT:    mov r4, r6
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
 ; THUMBV6-NEXT:    adcs r6, r5
-; THUMBV6-NEXT:    ldr r0, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r0, r7
 ; THUMBV6-NEXT:    mov r1, r5
-; THUMBV6-NEXT:    ldr r4, [sp, #44] @ 4-byte Reload
-; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    ldr r7, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r7
 ; THUMBV6-NEXT:    mov r3, r5
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    ldr r2, [sp, #28] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r2, [sp, #40] @ 4-byte Reload
 ; THUMBV6-NEXT:    adds r0, r0, r2
-; THUMBV6-NEXT:    str r0, [sp, #28] @ 4-byte Spill
-; THUMBV6-NEXT:    adcs r1, r6
-; THUMBV6-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; THUMBV6-NEXT:    ldr r0, [sp, #48] @ 4-byte Reload
-; THUMBV6-NEXT:    mov r1, r4
-; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r1, r5
+; THUMBV6-NEXT:    adds r0, r6, r1
+; THUMBV6-NEXT:    str r0, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r6, r7
+; THUMBV6-NEXT:    mov r7, r5
+; THUMBV6-NEXT:    adcs r7, r5
+; THUMBV6-NEXT:    mov r0, r4
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r6
 ; THUMBV6-NEXT:    mov r3, r5
 ; THUMBV6-NEXT:    bl __aeabi_lmul
-; THUMBV6-NEXT:    mov r6, r0
 ; THUMBV6-NEXT:    mov r4, r1
-; THUMBV6-NEXT:    ldr r0, [sp, #52] @ 4-byte Reload
-; THUMBV6-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
-; THUMBV6-NEXT:    mov r2, r5
-; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r1
+; THUMBV6-NEXT:    str r0, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    adcs r4, r7
+; THUMBV6-NEXT:    ldr r0, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r6
+; THUMBV6-NEXT:    ldr r2, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r0
+; THUMBV6-NEXT:    mov r7, r1
+; THUMBV6-NEXT:    ldr r0, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r2, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #76] @ 4-byte Reload
 ; THUMBV6-NEXT:    bl __aeabi_lmul
 ; THUMBV6-NEXT:    adds r0, r0, r6
-; THUMBV6-NEXT:    adcs r1, r4
-; THUMBV6-NEXT:    ldr r2, [sp, #28] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r7
+; THUMBV6-NEXT:    ldr r2, [sp, #36] @ 4-byte Reload
 ; THUMBV6-NEXT:    adds r0, r2, r0
-; THUMBV6-NEXT:    ldr r2, [sp, #24] @ 4-byte Reload
-; THUMBV6-NEXT:    adcs r1, r2
-; THUMBV6-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
-; THUMBV6-NEXT:    adds r0, r0, r2
-; THUMBV6-NEXT:    ldr r2, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r4
+; THUMBV6-NEXT:  .LBB0_8: @ %overflow.res
+; THUMBV6-NEXT:    ldr r2, [sp, #48] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    str r3, [r2]
+; THUMBV6-NEXT:    ldr r3, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    str r3, [r2, #4]
 ; THUMBV6-NEXT:    str r0, [r2, #8]
-; THUMBV6-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
-; THUMBV6-NEXT:    adcs r1, r0
 ; THUMBV6-NEXT:    str r1, [r2, #12]
-; THUMBV6-NEXT:    adcs r5, r5
-; THUMBV6-NEXT:    orrs r5, r7
 ; THUMBV6-NEXT:    movs r0, #1
 ; THUMBV6-NEXT:    ands r0, r5
 ; THUMBV6-NEXT:    strb r0, [r2, #16]
-; THUMBV6-NEXT:    add sp, #60
+; THUMBV6-NEXT:    add sp, #84
 ; THUMBV6-NEXT:    pop {r4, r5, r6, r7, pc}
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
index fe1d06cb39e16..07cd9788d91e1 100644
--- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
@@ -3,125 +3,211 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV7-LABEL: muloti_test:
-; THUMBV7:       @ %bb.0: @ %start
+; THUMBV7:       @ %bb.0: @ %overflow.entry
 ; THUMBV7-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; THUMBV7-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; THUMBV7-NEXT:    .pad #44
-; THUMBV7-NEXT:    sub sp, #44
-; THUMBV7-NEXT:    ldr.w r8, [sp, #88]
-; THUMBV7-NEXT:    mov r9, r0
-; THUMBV7-NEXT:    ldr r7, [sp, #96]
-; THUMBV7-NEXT:    ldr.w lr, [sp, #100]
-; THUMBV7-NEXT:    umull r0, r5, r2, r8
-; THUMBV7-NEXT:    ldr r4, [sp, #80]
-; THUMBV7-NEXT:    str r0, [sp, #32] @ 4-byte Spill
-; THUMBV7-NEXT:    umull r1, r0, r3, r7
-; THUMBV7-NEXT:    str r0, [sp, #4] @ 4-byte Spill
-; THUMBV7-NEXT:    umull r0, r11, lr, r2
-; THUMBV7-NEXT:    str r1, [sp, #20] @ 4-byte Spill
-; THUMBV7-NEXT:    ldr r1, [sp, #92]
-; THUMBV7-NEXT:    str r0, [sp] @ 4-byte Spill
-; THUMBV7-NEXT:    umull r0, r10, r7, r2
-; THUMBV7-NEXT:    mov r7, r1
-; THUMBV7-NEXT:    umull r6, r12, r1, r4
-; THUMBV7-NEXT:    str r0, [sp, #40] @ 4-byte Spill
-; THUMBV7-NEXT:    ldr r0, [sp, #84]
-; THUMBV7-NEXT:    str r6, [sp, #24] @ 4-byte Spill
-; THUMBV7-NEXT:    umull r6, r1, r0, r8
-; THUMBV7-NEXT:    str r6, [sp, #16] @ 4-byte Spill
-; THUMBV7-NEXT:    umull r6, r2, r2, r7
-; THUMBV7-NEXT:    mov r7, r4
-; THUMBV7-NEXT:    strd r6, r2, [sp, #8] @ 8-byte Folded Spill
-; THUMBV7-NEXT:    umull r2, r6, r4, r8
-; THUMBV7-NEXT:    str r2, [sp, #36] @ 4-byte Spill
-; THUMBV7-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
-; THUMBV7-NEXT:    str r6, [sp, #28] @ 4-byte Spill
-; THUMBV7-NEXT:    movs r6, #0
-; THUMBV7-NEXT:    str.w r2, [r9]
-; THUMBV7-NEXT:    umlal r5, r6, r3, r8
-; THUMBV7-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
-; THUMBV7-NEXT:    ldr r4, [sp] @ 4-byte Reload
-; THUMBV7-NEXT:    add r4, r2
-; THUMBV7-NEXT:    adds.w r2, r10, r4
-; THUMBV7-NEXT:    str r2, [sp, #20] @ 4-byte Spill
-; THUMBV7-NEXT:    mov.w r2, #0
-; THUMBV7-NEXT:    adc r2, r2, #0
-; THUMBV7-NEXT:    cmp.w r12, #0
-; THUMBV7-NEXT:    str r2, [sp, #32] @ 4-byte Spill
+; THUMBV7-NEXT:    .pad #12
+; THUMBV7-NEXT:    sub sp, #12
+; THUMBV7-NEXT:    ldrd r11, r6, [sp, #48]
+; THUMBV7-NEXT:    ldrd r10, r5, [sp, #64]
+; THUMBV7-NEXT:    ldrd r9, r12, [sp, #56]
+; THUMBV7-NEXT:    orrs.w r1, r11, r6
+; THUMBV7-NEXT:    beq .LBB0_3
+; THUMBV7-NEXT:  @ %bb.1: @ %overflow.lhs
+; THUMBV7-NEXT:    orr.w r4, r10, r5
+; THUMBV7-NEXT:    cmp r4, #0
+; THUMBV7-NEXT:    beq.w .LBB0_5
+; THUMBV7-NEXT:  @ %bb.2: @ %overflow
 ; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne.w r12, #1
+; THUMBV7-NEXT:    movne r4, #1
 ; THUMBV7-NEXT:    cmp r1, #0
-; THUMBV7-NEXT:    ldr r2, [sp, #96]
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r1, #1
-; THUMBV7-NEXT:    orrs.w r10, r7, r0
-; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne.w r10, #1
-; THUMBV7-NEXT:    orrs.w r7, r2, lr
-; THUMBV7-NEXT:    ldr r2, [sp, #92]
-; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne r7, #1
-; THUMBV7-NEXT:    cmp r0, #0
+; THUMBV7-NEXT:    and.w lr, r1, r4
+; THUMBV7-NEXT:    umull r7, r4, r6, r9
+; THUMBV7-NEXT:    cmp.w r12, #0
+; THUMBV7-NEXT:    mov r1, r12
 ; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne r0, #1
-; THUMBV7-NEXT:    cmp r2, #0
-; THUMBV7-NEXT:    mov r4, r2
-; THUMBV7-NEXT:    mov r8, r2
+; THUMBV7-NEXT:    movne r1, #1
+; THUMBV7-NEXT:    cmp r6, #0
 ; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne r4, #1
-; THUMBV7-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
-; THUMBV7-NEXT:    ands r0, r4
-; THUMBV7-NEXT:    movs r4, #0
-; THUMBV7-NEXT:    adds r5, r5, r2
-; THUMBV7-NEXT:    str.w r5, [r9, #4]
-; THUMBV7-NEXT:    orr.w r0, r0, r1
-; THUMBV7-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; THUMBV7-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
-; THUMBV7-NEXT:    and.w r5, r10, r7
-; THUMBV7-NEXT:    orr.w r0, r0, r12
-; THUMBV7-NEXT:    mov.w r12, #0
-; THUMBV7-NEXT:    add r1, r2
-; THUMBV7-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
-; THUMBV7-NEXT:    adcs r2, r6
-; THUMBV7-NEXT:    ldr r6, [sp, #28] @ 4-byte Reload
-; THUMBV7-NEXT:    adc r7, r4, #0
-; THUMBV7-NEXT:    adds r1, r1, r6
-; THUMBV7-NEXT:    umlal r2, r7, r3, r8
-; THUMBV7-NEXT:    adc r4, r4, #0
-; THUMBV7-NEXT:    orrs r0, r4
-; THUMBV7-NEXT:    orrs r0, r5
-; THUMBV7-NEXT:    ldrd r5, r4, [sp, #36] @ 8-byte Folded Reload
-; THUMBV7-NEXT:    adds r5, r5, r4
-; THUMBV7-NEXT:    ldr r4, [sp, #20] @ 4-byte Reload
-; THUMBV7-NEXT:    adcs r1, r4
-; THUMBV7-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
+; THUMBV7-NEXT:    movne r6, #1
+; THUMBV7-NEXT:    ands r1, r6
 ; THUMBV7-NEXT:    cmp r4, #0
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r4, #1
+; THUMBV7-NEXT:    orrs r1, r4
+; THUMBV7-NEXT:    umull r4, r6, r12, r11
+; THUMBV7-NEXT:    cmp r6, #0
+; THUMBV7-NEXT:    it ne
+; THUMBV7-NEXT:    movne r6, #1
+; THUMBV7-NEXT:    orrs r6, r1
+; THUMBV7-NEXT:    adds r1, r7, r4
+; THUMBV7-NEXT:    umull r11, r4, r11, r9
+; THUMBV7-NEXT:    adds.w r8, r4, r1
+; THUMBV7-NEXT:    mov.w r1, #0
+; THUMBV7-NEXT:    adc r4, r1, #0
 ; THUMBV7-NEXT:    cmp r3, #0
+; THUMBV7-NEXT:    orr.w r4, r4, r6
+; THUMBV7-NEXT:    umull r7, r6, r5, r2
+; THUMBV7-NEXT:    orr.w lr, lr, r4
+; THUMBV7-NEXT:    mov r4, r3
+; THUMBV7-NEXT:    it ne
+; THUMBV7-NEXT:    movne r4, #1
+; THUMBV7-NEXT:    cmp r5, #0
+; THUMBV7-NEXT:    it ne
+; THUMBV7-NEXT:    movne r5, #1
+; THUMBV7-NEXT:    ands r4, r5
+; THUMBV7-NEXT:    cmp r6, #0
 ; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne r3, #1
-; THUMBV7-NEXT:    cmp.w lr, #0
+; THUMBV7-NEXT:    movne r6, #1
+; THUMBV7-NEXT:    orrs r4, r6
+; THUMBV7-NEXT:    umull r5, r6, r3, r10
+; THUMBV7-NEXT:    cmp r6, #0
 ; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne.w lr, #1
-; THUMBV7-NEXT:    cmp.w r11, #0
+; THUMBV7-NEXT:    movne r6, #1
+; THUMBV7-NEXT:    orrs r4, r6
+; THUMBV7-NEXT:    add r5, r7
+; THUMBV7-NEXT:    umull r6, r7, r10, r2
+; THUMBV7-NEXT:    adds r5, r5, r7
+; THUMBV7-NEXT:    adc r7, r1, #0
+; THUMBV7-NEXT:    adds.w r6, r6, r11
+; THUMBV7-NEXT:    orr.w r4, r4, r7
+; THUMBV7-NEXT:    mov.w r7, #0
+; THUMBV7-NEXT:    orr.w lr, lr, r4
+; THUMBV7-NEXT:    umull r11, r4, r2, r9
+; THUMBV7-NEXT:    adc.w r10, r8, r5
+; THUMBV7-NEXT:    umlal r4, r7, r3, r9
+; THUMBV7-NEXT:    umull r2, r5, r2, r12
+; THUMBV7-NEXT:    adds.w r8, r2, r4
+; THUMBV7-NEXT:    adcs.w r2, r7, r5
+; THUMBV7-NEXT:    adc r4, r1, #0
+; THUMBV7-NEXT:    umlal r2, r4, r3, r12
+; THUMBV7-NEXT:    adds r2, r2, r6
+; THUMBV7-NEXT:    adcs.w r3, r4, r10
+; THUMBV7-NEXT:    adc r1, r1, #0
+; THUMBV7-NEXT:    orr.w r1, r1, lr
+; THUMBV7-NEXT:    b .LBB0_8
+; THUMBV7-NEXT:  .LBB0_3: @ %overflow.no.lhs
+; THUMBV7-NEXT:    orrs.w r1, r10, r5
+; THUMBV7-NEXT:    beq.w .LBB0_7
+; THUMBV7-NEXT:  @ %bb.4: @ %overflow.no.lhs.only
+; THUMBV7-NEXT:    umull r1, lr, r2, r10
+; THUMBV7-NEXT:    movs r7, #0
+; THUMBV7-NEXT:    umlal lr, r7, r3, r10
+; THUMBV7-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; THUMBV7-NEXT:    umull r4, r8, r2, r5
+; THUMBV7-NEXT:    adds.w r1, r4, lr
+; THUMBV7-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; THUMBV7-NEXT:    adcs.w r7, r7, r8
+; THUMBV7-NEXT:    mov.w r1, #0
+; THUMBV7-NEXT:    adc lr, r1, #0
+; THUMBV7-NEXT:    umull r8, r1, r10, r11
+; THUMBV7-NEXT:    mla r1, r10, r6, r1
+; THUMBV7-NEXT:    umlal r7, lr, r3, r5
+; THUMBV7-NEXT:    mla r1, r5, r11, r1
+; THUMBV7-NEXT:    adds.w r5, r7, r8
+; THUMBV7-NEXT:    umull r4, r7, r2, r9
+; THUMBV7-NEXT:    adc.w r10, lr, r1
+; THUMBV7-NEXT:    movs r1, #0
+; THUMBV7-NEXT:    umlal r7, r1, r3, r9
+; THUMBV7-NEXT:    umull r2, lr, r2, r12
+; THUMBV7-NEXT:    adds.w r8, r2, r7
+; THUMBV7-NEXT:    mov.w r2, #0
+; THUMBV7-NEXT:    adcs.w r1, r1, lr
+; THUMBV7-NEXT:    adc r2, r2, #0
+; THUMBV7-NEXT:    umlal r1, r2, r3, r12
+; THUMBV7-NEXT:    umull r3, r7, r9, r11
+; THUMBV7-NEXT:    mla r7, r9, r6, r7
+; THUMBV7-NEXT:    adds r1, r1, r3
+; THUMBV7-NEXT:    mla r7, r12, r11, r7
+; THUMBV7-NEXT:    mov r11, r4
+; THUMBV7-NEXT:    adc.w r3, r2, r7
+; THUMBV7-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; THUMBV7-NEXT:    adds r2, r2, r1
+; THUMBV7-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; THUMBV7-NEXT:    adcs r3, r1
+; THUMBV7-NEXT:    adcs r1, r5, #0
+; THUMBV7-NEXT:    adc r7, r10, #0
+; THUMBV7-NEXT:    b .LBB0_6
+; THUMBV7-NEXT:  .LBB0_5: @ %overflow.no.rhs.only
+; THUMBV7-NEXT:    umull r1, r4, r9, r11
+; THUMBV7-NEXT:    movs r7, #0
+; THUMBV7-NEXT:    mov.w r8, #0
+; THUMBV7-NEXT:    umlal r4, r7, r12, r11
+; THUMBV7-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; THUMBV7-NEXT:    umull r1, lr, r9, r6
+; THUMBV7-NEXT:    adds r1, r1, r4
+; THUMBV7-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; THUMBV7-NEXT:    adcs.w r7, r7, lr
+; THUMBV7-NEXT:    umull lr, r1, r11, r10
+; THUMBV7-NEXT:    adc r4, r8, #0
+; THUMBV7-NEXT:    mla r1, r11, r5, r1
+; THUMBV7-NEXT:    umlal r7, r4, r12, r6
+; THUMBV7-NEXT:    mla r1, r6, r10, r1
+; THUMBV7-NEXT:    adds.w r7, r7, lr
+; THUMBV7-NEXT:    str r7, [sp] @ 4-byte Spill
+; THUMBV7-NEXT:    mov.w r7, #0
+; THUMBV7-NEXT:    adc.w r11, r4, r1
+; THUMBV7-NEXT:    umull lr, r4, r9, r2
+; THUMBV7-NEXT:    umlal r4, r7, r12, r2
+; THUMBV7-NEXT:    umull r1, r9, r9, r3
+; THUMBV7-NEXT:    adds.w r8, r1, r4
+; THUMBV7-NEXT:    mov.w r4, #0
+; THUMBV7-NEXT:    adcs.w r1, r7, r9
+; THUMBV7-NEXT:    umull r7, r6, r2, r10
+; THUMBV7-NEXT:    adc r4, r4, #0
+; THUMBV7-NEXT:    mla r2, r2, r5, r6
+; THUMBV7-NEXT:    umlal r1, r4, r12, r3
+; THUMBV7-NEXT:    mla r2, r3, r10, r2
+; THUMBV7-NEXT:    adds r1, r1, r7
+; THUMBV7-NEXT:    adc.w r3, r4, r2
+; THUMBV7-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; THUMBV7-NEXT:    adds r2, r2, r1
+; THUMBV7-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; THUMBV7-NEXT:    adcs r3, r1
+; THUMBV7-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; THUMBV7-NEXT:    adcs r1, r1, #0
+; THUMBV7-NEXT:    adc r7, r11, #0
+; THUMBV7-NEXT:    mov r11, lr
+; THUMBV7-NEXT:  .LBB0_6: @ %overflow.res
+; THUMBV7-NEXT:    orrs r1, r7
 ; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne.w r11, #1
-; THUMBV7-NEXT:    adds r2, r2, r5
-; THUMBV7-NEXT:    and.w r3, r3, lr
-; THUMBV7-NEXT:    str.w r2, [r9, #8]
-; THUMBV7-NEXT:    adcs r1, r7
-; THUMBV7-NEXT:    str.w r1, [r9, #12]
-; THUMBV7-NEXT:    orr.w r1, r3, r11
-; THUMBV7-NEXT:    ldr r2, [sp, #32] @ 4-byte Reload
-; THUMBV7-NEXT:    orr.w r1, r1, r4
-; THUMBV7-NEXT:    orr.w r1, r1, r2
-; THUMBV7-NEXT:    orr.w r0, r0, r1
-; THUMBV7-NEXT:    adc r1, r12, #0
-; THUMBV7-NEXT:    orrs r0, r1
-; THUMBV7-NEXT:    and r0, r0, #1
-; THUMBV7-NEXT:    strb.w r0, [r9, #16]
-; THUMBV7-NEXT:    add sp, #44
+; THUMBV7-NEXT:    movne r1, #1
+; THUMBV7-NEXT:    b .LBB0_8
+; THUMBV7-NEXT:  .LBB0_7: @ %overflow.no
+; THUMBV7-NEXT:    umull r1, lr, r2, r9
+; THUMBV7-NEXT:    movs r4, #0
+; THUMBV7-NEXT:    umlal lr, r4, r3, r9
+; THUMBV7-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; THUMBV7-NEXT:    movs r1, #0
+; THUMBV7-NEXT:    umull r7, r8, r2, r12
+; THUMBV7-NEXT:    adds.w r7, r7, lr
+; THUMBV7-NEXT:    str r7, [sp] @ 4-byte Spill
+; THUMBV7-NEXT:    adcs.w r7, r4, r8
+; THUMBV7-NEXT:    ldr r4, [sp, #60]
+; THUMBV7-NEXT:    adc r8, r1, #0
+; THUMBV7-NEXT:    umlal r7, r8, r3, r12
+; THUMBV7-NEXT:    umull r12, lr, r9, r11
+; THUMBV7-NEXT:    mla r6, r9, r6, lr
+; THUMBV7-NEXT:    str.w r12, [sp, #4] @ 4-byte Spill
+; THUMBV7-NEXT:    mla r12, r4, r11, r6
+; THUMBV7-NEXT:    ldr.w r11, [sp, #8] @ 4-byte Reload
+; THUMBV7-NEXT:    umull lr, r6, r10, r2
+; THUMBV7-NEXT:    mla r3, r10, r3, r6
+; THUMBV7-NEXT:    mla r2, r5, r2, r3
+; THUMBV7-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
+; THUMBV7-NEXT:    adds.w r3, r3, lr
+; THUMBV7-NEXT:    adc.w r6, r2, r12
+; THUMBV7-NEXT:    adds r2, r7, r3
+; THUMBV7-NEXT:    adc.w r3, r8, r6
+; THUMBV7-NEXT:    ldr.w r8, [sp] @ 4-byte Reload
+; THUMBV7-NEXT:  .LBB0_8: @ %overflow.res
+; THUMBV7-NEXT:    strd r11, r8, [r0]
+; THUMBV7-NEXT:    and r1, r1, #1
+; THUMBV7-NEXT:    strd r2, r3, [r0, #8]
+; THUMBV7-NEXT:    strb r1, [r0, #16]
+; THUMBV7-NEXT:    add sp, #12
 ; THUMBV7-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
index 55e917159fce9..997868766d1dd 100644
--- a/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
@@ -3,15 +3,19 @@
 
 define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; THUMBV7-LABEL: mulodi_test:
-; THUMBV7:       @ %bb.0: @ %start
+; THUMBV7:       @ %bb.0: @ %overflow.entry
 ; THUMBV7-NEXT:    .save {r4, r5, r7, lr}
 ; THUMBV7-NEXT:    push {r4, r5, r7, lr}
-; THUMBV7-NEXT:    umull r12, lr, r3, r0
+; THUMBV7-NEXT:    cbz r1, .LBB0_3
+; THUMBV7-NEXT:  @ %bb.1: @ %overflow.lhs
+; THUMBV7-NEXT:    cbz r3, .LBB0_5
+; THUMBV7-NEXT:  @ %bb.2: @ %overflow
+; THUMBV7-NEXT:    umull lr, r4, r3, r0
 ; THUMBV7-NEXT:    cmp r3, #0
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r3, #1
 ; THUMBV7-NEXT:    cmp r1, #0
-; THUMBV7-NEXT:    umull r0, r4, r0, r2
+; THUMBV7-NEXT:    umull r0, r12, r0, r2
 ; THUMBV7-NEXT:    umull r2, r5, r1, r2
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r1, #1
@@ -20,15 +24,44 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r5, #1
 ; THUMBV7-NEXT:    orrs r1, r5
-; THUMBV7-NEXT:    cmp.w lr, #0
+; THUMBV7-NEXT:    cmp r4, #0
 ; THUMBV7-NEXT:    it ne
-; THUMBV7-NEXT:    movne.w lr, #1
-; THUMBV7-NEXT:    orr.w r3, r1, lr
-; THUMBV7-NEXT:    add.w r1, r2, r12
+; THUMBV7-NEXT:    movne r4, #1
+; THUMBV7-NEXT:    orr.w r3, r1, r4
+; THUMBV7-NEXT:    add.w r1, r2, lr
 ; THUMBV7-NEXT:    movs r2, #0
-; THUMBV7-NEXT:    adds r1, r1, r4
+; THUMBV7-NEXT:    adds.w r1, r1, r12
 ; THUMBV7-NEXT:    adc r2, r2, #0
-; THUMBV7-NEXT:    orrs r2, r3
+; THUMBV7-NEXT:    orr.w r12, r3, r2
+; THUMBV7-NEXT:    and r2, r12, #1
+; THUMBV7-NEXT:    pop {r4, r5, r7, pc}
+; THUMBV7-NEXT:  .LBB0_3: @ %overflow.no.lhs
+; THUMBV7-NEXT:    mov r5, r0
+; THUMBV7-NEXT:    umull r0, r4, r0, r2
+; THUMBV7-NEXT:    cbz r3, .LBB0_7
+; THUMBV7-NEXT:  @ %bb.4: @ %overflow.no.lhs.only
+; THUMBV7-NEXT:    mul r12, r1, r3
+; THUMBV7-NEXT:    mla r1, r1, r2, r4
+; THUMBV7-NEXT:    umlal r1, r12, r5, r3
+; THUMBV7-NEXT:    b .LBB0_6
+; THUMBV7-NEXT:  .LBB0_5: @ %overflow.no.rhs.only
+; THUMBV7-NEXT:    mov lr, r0
+; THUMBV7-NEXT:    umull r0, r4, r2, r0
+; THUMBV7-NEXT:    mov r5, r1
+; THUMBV7-NEXT:    mul r12, r3, r1
+; THUMBV7-NEXT:    mla r1, r3, lr, r4
+; THUMBV7-NEXT:    umlal r1, r12, r2, r5
+; THUMBV7-NEXT:  .LBB0_6: @ %overflow.res
+; THUMBV7-NEXT:    cmp.w r12, #0
+; THUMBV7-NEXT:    it ne
+; THUMBV7-NEXT:    movne.w r12, #1
+; THUMBV7-NEXT:    and r2, r12, #1
+; THUMBV7-NEXT:    pop {r4, r5, r7, pc}
+; THUMBV7-NEXT:  .LBB0_7: @ %overflow.no
+; THUMBV7-NEXT:    mla r3, r5, r3, r4
+; THUMBV7-NEXT:    mov.w r12, #0
+; THUMBV7-NEXT:    mla r1, r1, r2, r3
+; THUMBV7-NEXT:    and r2, r12, #1
 ; THUMBV7-NEXT:    pop {r4, r5, r7, pc}
 start:
   %0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %l, i64 %r) #2
diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll
index e101c702e6409..2d236cce94c30 100644
--- a/llvm/test/CodeGen/X86/muloti.ll
+++ b/llvm/test/CodeGen/X86/muloti.ll
@@ -6,60 +6,181 @@
 ; This used to call muloti4, but that won't link with libgcc.
 define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp {
 ; CHECK-LABEL: x:
-; CHECK:       ## %bb.0: ## %entry
+; CHECK:       ## %bb.0: ## %overflow.entry
 ; CHECK-NEXT:    pushq %r14
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pushq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 24
 ; CHECK-NEXT:    .cfi_offset %rbx, -24
 ; CHECK-NEXT:    .cfi_offset %r14, -16
-; CHECK-NEXT:    movq %rdx, %r9
-; CHECK-NEXT:    movq %rsi, %r8
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    movq %rdi, %r8
+; CHECK-NEXT:    sarq $63, %r8
+; CHECK-NEXT:    cmpq %r8, %rsi
+; CHECK-NEXT:    je LBB0_5
+; CHECK-NEXT:  ## %bb.1: ## %overflow.lhs
+; CHECK-NEXT:    cmpq %rax, %rcx
+; CHECK-NEXT:    je LBB0_2
+; CHECK-NEXT:  ## %bb.7: ## %overflow1
 ; CHECK-NEXT:    movq %rsi, %rbx
 ; CHECK-NEXT:    sarq $63, %rbx
 ; CHECK-NEXT:    imulq %rdx, %rbx
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq %rdx
 ; CHECK-NEXT:    movq %rdx, %r10
-; CHECK-NEXT:    movq %rax, %rsi
-; CHECK-NEXT:    movq %r8, %rax
-; CHECK-NEXT:    mulq %r9
+; CHECK-NEXT:    mulq %rdx
 ; CHECK-NEXT:    movq %rdx, %r9
+; CHECK-NEXT:    movq %rax, %r8
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    mulq %r10
+; CHECK-NEXT:    movq %rdx, %r10
 ; CHECK-NEXT:    movq %rax, %r11
-; CHECK-NEXT:    addq %r10, %r11
-; CHECK-NEXT:    adcq %rbx, %r9
-; CHECK-NEXT:    movq %r9, %rbx
+; CHECK-NEXT:    addq %r9, %r11
+; CHECK-NEXT:    adcq %rbx, %r10
+; CHECK-NEXT:    movq %r10, %rbx
 ; CHECK-NEXT:    sarq $63, %rbx
-; CHECK-NEXT:    movq %rcx, %r14
-; CHECK-NEXT:    sarq $63, %r14
-; CHECK-NEXT:    imulq %rdi, %r14
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    movq %rdi, %r14
+; CHECK-NEXT:    imulq %rax, %r14
 ; CHECK-NEXT:    movq %rdi, %rax
 ; CHECK-NEXT:    mulq %rcx
-; CHECK-NEXT:    movq %rdx, %r10
+; CHECK-NEXT:    movq %rdx, %r9
 ; CHECK-NEXT:    movq %rax, %rdi
 ; CHECK-NEXT:    addq %r11, %rdi
-; CHECK-NEXT:    adcq %r14, %r10
-; CHECK-NEXT:    movq %r10, %r11
+; CHECK-NEXT:    adcq %r14, %r9
+; CHECK-NEXT:    movq %r9, %r11
 ; CHECK-NEXT:    sarq $63, %r11
-; CHECK-NEXT:    addq %r9, %r10
+; CHECK-NEXT:    addq %r10, %r9
 ; CHECK-NEXT:    adcq %rbx, %r11
-; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    imulq %rcx
-; CHECK-NEXT:    addq %r10, %rax
+; CHECK-NEXT:    movq %rax, %rcx
+; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    addq %r9, %rcx
 ; CHECK-NEXT:    adcq %r11, %rdx
-; CHECK-NEXT:    movq %rdi, %rcx
-; CHECK-NEXT:    sarq $63, %rcx
-; CHECK-NEXT:    xorq %rcx, %rdx
-; CHECK-NEXT:    xorq %rax, %rcx
-; CHECK-NEXT:    orq %rdx, %rcx
-; CHECK-NEXT:    jne LBB0_1
-; CHECK-NEXT:  ## %bb.2: ## %nooverflow
+; CHECK-NEXT:    movq %rdi, %rsi
+; CHECK-NEXT:    sarq $63, %rdi
+; CHECK-NEXT:    xorq %rdi, %rdx
+; CHECK-NEXT:    xorq %rcx, %rdi
+; CHECK-NEXT:    orq %rdx, %rdi
+; CHECK-NEXT:    jmp LBB0_8
+; CHECK-NEXT:  LBB0_5: ## %overflow.no.lhs
+; CHECK-NEXT:    cmpq %rax, %rcx
+; CHECK-NEXT:    je LBB0_6
+; CHECK-NEXT:  ## %bb.4: ## %overflow.no.lhs.only
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    movq %rsi, %r9
+; CHECK-NEXT:    xorq %rax, %r9
+; CHECK-NEXT:    movq %rdi, %r8
+; CHECK-NEXT:    xorq %rax, %r8
+; CHECK-NEXT:    subq %rax, %r8
+; CHECK-NEXT:    sbbq %rax, %r9
+; CHECK-NEXT:    testq %rsi, %rsi
+; CHECK-NEXT:    sets %r10b
+; CHECK-NEXT:    cmovnsq %rsi, %r9
+; CHECK-NEXT:    cmovnsq %rdi, %r8
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    movq %rcx, %rsi
+; CHECK-NEXT:    xorq %rax, %rsi
+; CHECK-NEXT:    movq %rdx, %rdi
+; CHECK-NEXT:    xorq %rax, %rdi
+; CHECK-NEXT:    subq %rax, %rdi
+; CHECK-NEXT:    sbbq %rax, %rsi
+; CHECK-NEXT:    testq %rcx, %rcx
+; CHECK-NEXT:    sets %r11b
+; CHECK-NEXT:    cmovnsq %rcx, %rsi
+; CHECK-NEXT:    cmovnsq %rdx, %rdi
+; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    mulq %rdi
+; CHECK-NEXT:    movq %rax, %rcx
+; CHECK-NEXT:    imulq %r9, %rdi
+; CHECK-NEXT:    addq %rdx, %rdi
+; CHECK-NEXT:    imulq %rsi, %r9
+; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    mulq %rsi
+; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    addq %rdi, %rsi
+; CHECK-NEXT:    adcq %r9, %rdx
+; CHECK-NEXT:    xorb %r10b, %r11b
+; CHECK-NEXT:    movzbl %r11b, %ecx
+; CHECK-NEXT:    jmp LBB0_3
+; CHECK-NEXT:  LBB0_2: ## %overflow.no.rhs.only
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    movq %rcx, %r9
+; CHECK-NEXT:    xorq %rax, %r9
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    xorq %rax, %r8
+; CHECK-NEXT:    subq %rax, %r8
+; CHECK-NEXT:    sbbq %rax, %r9
+; CHECK-NEXT:    testq %rcx, %rcx
+; CHECK-NEXT:    sets %r10b
+; CHECK-NEXT:    cmovnsq %rcx, %r9
+; CHECK-NEXT:    cmovnsq %rdx, %r8
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    sarq $63, %rax
+; CHECK-NEXT:    movq %rsi, %r14
+; CHECK-NEXT:    xorq %rax, %r14
+; CHECK-NEXT:    movq %rdi, %r11
+; CHECK-NEXT:    xorq %rax, %r11
+; CHECK-NEXT:    subq %rax, %r11
+; CHECK-NEXT:    sbbq %rax, %r14
+; CHECK-NEXT:    testq %rsi, %rsi
+; CHECK-NEXT:    sets %bl
+; CHECK-NEXT:    cmovnsq %rsi, %r14
+; CHECK-NEXT:    cmovnsq %rdi, %r11
+; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    mulq %r11
+; CHECK-NEXT:    movq %rax, %rcx
+; CHECK-NEXT:    imulq %r9, %r11
+; CHECK-NEXT:    addq %rdx, %r11
+; CHECK-NEXT:    imulq %r14, %r9
+; CHECK-NEXT:    movq %r8, %rax
+; CHECK-NEXT:    mulq %r14
+; CHECK-NEXT:    movq %rax, %rsi
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    addq %r11, %rsi
+; CHECK-NEXT:    adcq %r9, %rdx
+; CHECK-NEXT:    xorb %r10b, %bl
+; CHECK-NEXT:    movzbl %bl, %ecx
+; CHECK-NEXT:  LBB0_3: ## %overflow.res
+; CHECK-NEXT:    movq %rcx, %rdi
+; CHECK-NEXT:    negq %rdi
+; CHECK-NEXT:    xorq %rdi, %rax
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    xorl %r8d, %r8d
+; CHECK-NEXT:    cmpq %rcx, %rax
+; CHECK-NEXT:    setb %r8b
+; CHECK-NEXT:    xorq %rdi, %rsi
+; CHECK-NEXT:    addq %r8, %rsi
+; CHECK-NEXT:    xorq %rdx, %rdi
+; CHECK-NEXT:    cmpq %r8, %rsi
+; CHECK-NEXT:    adcq $0, %rdi
+; CHECK-NEXT:  LBB0_8: ## %overflow.res
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    jne LBB0_10
+; CHECK-NEXT:  LBB0_11: ## %nooverflow
+; CHECK-NEXT:    movq %rsi, %rdx
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r14
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  LBB0_1: ## %overflow
+; CHECK-NEXT:  LBB0_6: ## %overflow.no
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    mulq %rdx
+; CHECK-NEXT:    imulq %rcx, %rdi
+; CHECK-NEXT:    addq %rdx, %rdi
+; CHECK-NEXT:    imulq %r8, %rsi
+; CHECK-NEXT:    addq %rdi, %rsi
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    je LBB0_11
+; CHECK-NEXT:  LBB0_10: ## %overflow
 ; CHECK-NEXT:    ud2
 entry:
   %tmp16 = zext i64 %a.coerce0 to i128
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 13596e1b18768..1460a2564cc3e 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -4,64 +4,185 @@
 
 define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X64-LABEL: smuloi128:
-; X64:       ## %bb.0:
-; X64-NEXT:    pushq %r15
+; X64:       ## %bb.0: ## %overflow.entry
+; X64-NEXT:    pushq %rbp
 ; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r15
 ; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    .cfi_offset %rbx, -32
-; X64-NEXT:    .cfi_offset %r14, -24
-; X64-NEXT:    .cfi_offset %r15, -16
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rsi, %r9
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 40
+; X64-NEXT:    .cfi_offset %rbx, -40
+; X64-NEXT:    .cfi_offset %r14, -32
+; X64-NEXT:    .cfi_offset %r15, -24
+; X64-NEXT:    .cfi_offset %rbp, -16
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rdi, %r9
+; X64-NEXT:    sarq $63, %r9
+; X64-NEXT:    cmpq %r9, %rsi
+; X64-NEXT:    je LBB0_5
+; X64-NEXT:  ## %bb.1: ## %overflow.lhs
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    je LBB0_2
+; X64-NEXT:  ## %bb.7: ## %overflow
 ; X64-NEXT:    movq %rsi, %r14
 ; X64-NEXT:    sarq $63, %r14
 ; X64-NEXT:    imulq %rdx, %r14
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %rdx
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %rdx
 ; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    addq %r11, %rbx
-; X64-NEXT:    adcq %r14, %r10
-; X64-NEXT:    movq %r10, %r14
+; X64-NEXT:    addq %r10, %rbx
+; X64-NEXT:    adcq %r14, %r11
+; X64-NEXT:    movq %r11, %r14
 ; X64-NEXT:    sarq $63, %r14
-; X64-NEXT:    movq %rcx, %r15
-; X64-NEXT:    sarq $63, %r15
-; X64-NEXT:    imulq %rdi, %r15
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rdi, %r15
+; X64-NEXT:    imulq %rax, %r15
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %rbx, %rdi
-; X64-NEXT:    adcq %r15, %r11
-; X64-NEXT:    movq %r11, %rbx
+; X64-NEXT:    adcq %r15, %r10
+; X64-NEXT:    movq %r10, %rbx
 ; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    addq %r10, %r11
+; X64-NEXT:    addq %r11, %r10
 ; X64-NEXT:    adcq %r14, %rbx
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    imulq %rcx
-; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    addq %r10, %rax
 ; X64-NEXT:    adcq %rbx, %rdx
-; X64-NEXT:    movq %rdi, 8(%r8)
+; X64-NEXT:    movq %rdi, %rsi
 ; X64-NEXT:    sarq $63, %rdi
 ; X64-NEXT:    xorq %rdi, %rdx
 ; X64-NEXT:    xorq %rax, %rdi
 ; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    jmp LBB0_8
+; X64-NEXT:  LBB0_5: ## %overflow.no.lhs
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    je LBB0_6
+; X64-NEXT:  ## %bb.4: ## %overflow.no.lhs.only
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rsi, %r11
+; X64-NEXT:    xorq %rax, %r11
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    xorq %rax, %r10
+; X64-NEXT:    subq %rax, %r10
+; X64-NEXT:    sbbq %rax, %r11
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    sets %bl
+; X64-NEXT:    cmovnsq %rsi, %r11
+; X64-NEXT:    cmovnsq %rdi, %r10
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rcx, %rsi
+; X64-NEXT:    xorq %rax, %rsi
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    xorq %rax, %rdi
+; X64-NEXT:    subq %rax, %rdi
+; X64-NEXT:    sbbq %rax, %rsi
+; X64-NEXT:    testq %rcx, %rcx
+; X64-NEXT:    sets %bpl
+; X64-NEXT:    cmovnsq %rcx, %rsi
+; X64-NEXT:    cmovnsq %rdx, %rdi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    imulq %r11, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    imulq %rsi, %r11
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    jmp LBB0_3
+; X64-NEXT:  LBB0_2: ## %overflow.no.rhs.only
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rcx, %r11
+; X64-NEXT:    xorq %rax, %r11
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    xorq %rax, %r10
+; X64-NEXT:    subq %rax, %r10
+; X64-NEXT:    sbbq %rax, %r11
+; X64-NEXT:    testq %rcx, %rcx
+; X64-NEXT:    sets %bl
+; X64-NEXT:    cmovnsq %rcx, %r11
+; X64-NEXT:    cmovnsq %rdx, %r10
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rsi, %r14
+; X64-NEXT:    xorq %rax, %r14
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    subq %rax, %rcx
+; X64-NEXT:    sbbq %rax, %r14
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    sets %bpl
+; X64-NEXT:    cmovnsq %rsi, %r14
+; X64-NEXT:    cmovnsq %rdi, %rcx
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    imulq %r11, %rcx
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    imulq %r14, %r11
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %rcx, %rsi
+; X64-NEXT:  LBB0_3: ## %overflow.res
+; X64-NEXT:    adcq %r11, %rdx
+; X64-NEXT:    xorb %bl, %bpl
+; X64-NEXT:    movzbl %bpl, %eax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    xorq %rcx, %r9
+; X64-NEXT:    addq %rax, %r9
+; X64-NEXT:    xorl %edi, %edi
+; X64-NEXT:    cmpq %rax, %r9
+; X64-NEXT:    setb %dil
+; X64-NEXT:    xorq %rcx, %rsi
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    xorq %rdx, %rcx
+; X64-NEXT:    cmpq %rdi, %rsi
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:  LBB0_8: ## %overflow.res
 ; X64-NEXT:    setne %al
-; X64-NEXT:    movq %rsi, (%r8)
+; X64-NEXT:    jmp LBB0_9
+; X64-NEXT:  LBB0_6: ## %overflow.no
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    mulq %rdx
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    imulq %rcx, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    imulq %r10, %rsi
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:  LBB0_9: ## %overflow.res
+; X64-NEXT:    movq %r9, (%r8)
+; X64-NEXT:    movq %rsi, 8(%r8)
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: smuloi128:
-; X86:       ## %bb.0:
+; X86:       ## %bb.0: ## %overflow.entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %ebx
@@ -70,196 +191,212 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $44, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 64
+; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 72
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    je LBB0_12
+; X86-NEXT:  ## %bb.1: ## %overflow.lhs
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    je LBB0_2
+; X86-NEXT:  ## %bb.14: ## %overflow
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ebx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %edi
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    imull %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %eax, %ebx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    addl %eax, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl (%esp), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    xorl %ecx, %eax
@@ -268,38 +405,435 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    xorl %ecx, %edx
 ; X86-NEXT:    xorl %ebx, %ecx
 ; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    jmp LBB0_15
+; X86-NEXT:  LBB0_12: ## %overflow.no.lhs
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    je LBB0_13
+; X86-NEXT:  ## %bb.7: ## %overflow.no.lhs.only
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    subl %eax, %ebp
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    js LBB0_9
+; X86-NEXT:  ## %bb.8: ## %overflow.no.lhs.only
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:  LBB0_9: ## %overflow.no.lhs.only
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    js LBB0_11
+; X86-NEXT:  ## %bb.10: ## %overflow.no.lhs.only
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:  LBB0_11: ## %overflow.no.lhs.only
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    imull %edi, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    cmpl %edx, %ebx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    setb %dl
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    orl %ecx, %ebp
+; X86-NEXT:    jmp LBB0_15
+; X86-NEXT:  LBB0_2: ## %overflow.no.rhs.only
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    subl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    js LBB0_4
+; X86-NEXT:  ## %bb.3: ## %overflow.no.rhs.only
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:  LBB0_4: ## %overflow.no.rhs.only
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    js LBB0_6
+; X86-NEXT:  ## %bb.5: ## %overflow.no.rhs.only
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:  LBB0_6: ## %overflow.no.rhs.only
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    adcl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    cmpl %edx, %ebx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    setb %dl
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    cmpl %edx, %ebp
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:  LBB0_15: ## %overflow.res
 ; X86-NEXT:    setne %al
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-  %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
-  %val = extractvalue {i128, i1} %t, 0
-  %obit = extractvalue {i128, i1} %t, 1
-  store i128 %val, ptr %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
-; X64-LABEL: smuloi256:
-; X64:       ## %bb.0:
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    pushq %r15
-; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %r14
-; X64-NEXT:    .cfi_def_cfa_offset 32
+; X86-NEXT:    jmp LBB0_16
+; X86-NEXT:  LBB0_13: ## %overflow.no
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %esi, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  LBB0_16: ## %overflow.res
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, (%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, 4(%ecx)
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl %esi, 12(%ecx)
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    ## kill: def $al killed $al killed $eax
+; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+  %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
+  %val = extractvalue {i128, i1} %t, 0
+  %obit = extractvalue {i128, i1} %t, 1
+  store i128 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
+; X64-LABEL: smuloi256:
+; X64:       ## %bb.0: ## %overflow.entry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 32
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    .cfi_def_cfa_offset 40
 ; X64-NEXT:    pushq %r12
@@ -312,199 +846,558 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movq %r8, %r12
-; X64-NEXT:    movq %rcx, %rbx
+; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    movq %rcx, %r12
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rsi, %r10
-; X64-NEXT:    movq %rdi, %r11
-; X64-NEXT:    movq %rdx, %rax
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rsi, %r11
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    movq %r9, %rsi
+; X64-NEXT:    sarq $63, %rsi
+; X64-NEXT:    movq %r11, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    xorq %rcx, %rdx
+; X64-NEXT:    xorq %r8, %rcx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    je LBB1_4
+; X64-NEXT:  ## %bb.1: ## %overflow.lhs
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    xorq %rsi, %rcx
+; X64-NEXT:    xorq %rbx, %rsi
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    je LBB1_2
+; X64-NEXT:  ## %bb.6: ## %overflow
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %r14, %r13
-; X64-NEXT:    adcq %rcx, %rsi
+; X64-NEXT:    addq %rdi, %r13
+; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %al
-; X64-NEXT:    movzbl %al, %ecx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    addq %rsi, %r14
-; X64-NEXT:    adcq %rcx, %r8
-; X64-NEXT:    movq %rbx, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    movq %r9, %rsi
-; X64-NEXT:    imulq %rcx, %rsi
+; X64-NEXT:    movzbl %al, %edi
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rcx
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    addq %rax, %r15
-; X64-NEXT:    addq %rsi, %r15
-; X64-NEXT:    addq %rax, %r14
-; X64-NEXT:    adcq %r8, %r15
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rcx, %r10
+; X64-NEXT:    adcq %rdi, %rsi
+; X64-NEXT:    movq %r12, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %r9, %rcx
+; X64-NEXT:    imulq %rdx, %rcx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %rdx
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    addq %rax, %rbp
+; X64-NEXT:    addq %rcx, %rbp
+; X64-NEXT:    addq %rax, %r10
+; X64-NEXT:    adcq %rsi, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    addq %rsi, %r12
-; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rcx, %r8
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    addq %r12, %rax
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    addq %r8, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %rdi, %rbx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    adcq %rsi, %rdi
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    addq %rbx, %rsi
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %rdi, %rcx
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    adcq %rax, %rbx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
+; X64-NEXT:    adcq %r13, %rbx
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %rbp
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r13, %rbp
-; X64-NEXT:    adcq $0, %r14
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r15, %r12
-; X64-NEXT:    sarq $63, %r12
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r8
-; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rdi, %r9
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    addq %rsi, %r9
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %r9, %r10
+; X64-NEXT:    adcq %r8, %rsi
+; X64-NEXT:    setb %r9b
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rdi
-; X64-NEXT:    movq %rdi, %r11
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    addq %r9, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %rsi, %r8
+; X64-NEXT:    movzbl %r9b, %eax
+; X64-NEXT:    adcq %rax, %r13
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    sarq $63, %rsi
+; X64-NEXT:    imulq %rsi, %r11
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    addq %r11, %r9
+; X64-NEXT:    addq %rax, %r9
+; X64-NEXT:    addq %rax, %r8
+; X64-NEXT:    adcq %r13, %r9
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq %rbx, %r10
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %rbp, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload
+; X64-NEXT:    adcq %rbp, %r9
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    adcq %rax, %rdx
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %rax, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %r15, %r11
+; X64-NEXT:    movq %r15, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
+; X64-NEXT:    imulq %rcx, %r11
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rax, %rcx
+; X64-NEXT:    addq %r11, %rcx
+; X64-NEXT:    movq %r12, %r15
+; X64-NEXT:    imulq %rsi, %r12
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    adcq %r13, %rdi
-; X64-NEXT:    setb %r8b
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    addq %r12, %r11
+; X64-NEXT:    addq %rax, %r11
+; X64-NEXT:    addq %r14, %rbx
+; X64-NEXT:    adcq %rcx, %r11
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    movq %rsi, %r12
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq %rax, %r13
+; X64-NEXT:    addq %rsi, %r13
+; X64-NEXT:    adcq $0, %r14
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %r13, %rsi
+; X64-NEXT:    adcq %r14, %r12
+; X64-NEXT:    setb %r14b
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    addq %r12, %rax
+; X64-NEXT:    movzbl %r14b, %r14d
+; X64-NEXT:    adcq %r14, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    adcq %r11, %rdx
+; X64-NEXT:    addq %r8, %rcx
+; X64-NEXT:    adcq %r9, %rsi
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Folded Reload
+; X64-NEXT:    movq %r10, %r8
+; X64-NEXT:    sarq $63, %r8
+; X64-NEXT:    xorq %r8, %rax
+; X64-NEXT:    xorq %r8, %rcx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    xorq %r8, %rdx
+; X64-NEXT:    xorq %rsi, %r8
+; X64-NEXT:    orq %rdx, %r8
+; X64-NEXT:    orq %rcx, %r8
+; X64-NEXT:    jmp LBB1_7
+; X64-NEXT:  LBB1_4: ## %overflow.no.lhs
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    xorq %rsi, %rcx
+; X64-NEXT:    xorq %rbx, %rsi
+; X64-NEXT:    orq %rcx, %rsi
+; X64-NEXT:    je LBB1_5
+; X64-NEXT:  ## %bb.3: ## %overflow.no.lhs.only
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    sarq $63, %rsi
+; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    xorq %rsi, %rcx
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq %r8, %rbp
+; X64-NEXT:    xorq %rsi, %rbp
+; X64-NEXT:    movq %r9, %rcx
+; X64-NEXT:    movq %r11, %r13
+; X64-NEXT:    xorq %rsi, %r13
+; X64-NEXT:    movq %rdi, %r10
+; X64-NEXT:    xorq %rsi, %r10
+; X64-NEXT:    subq %rsi, %r10
+; X64-NEXT:    sbbq %rsi, %r13
+; X64-NEXT:    sbbq %rsi, %rbp
+; X64-NEXT:    sbbq %rsi, %rdx
+; X64-NEXT:    testq %r12, %r12
+; X64-NEXT:    sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
+; X64-NEXT:    cmovnsq %r12, %rdx
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    cmovnsq %r8, %rbp
+; X64-NEXT:    cmovnsq %r11, %r13
+; X64-NEXT:    cmovnsq %rdi, %r10
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    movq %rbx, %r12
+; X64-NEXT:    xorq %rdx, %r12
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    xorq %rdx, %r14
+; X64-NEXT:    xorq %rdx, %r9
+; X64-NEXT:    movq %r15, %r11
+; X64-NEXT:    xorq %rdx, %r11
+; X64-NEXT:    subq %rdx, %r11
+; X64-NEXT:    sbbq %rdx, %r9
+; X64-NEXT:    sbbq %rdx, %r14
+; X64-NEXT:    sbbq %rdx, %r12
+; X64-NEXT:    testq %rbx, %rbx
+; X64-NEXT:    sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
+; X64-NEXT:    cmovnsq %rbx, %r12
+; X64-NEXT:    cmovnsq %rax, %r14
+; X64-NEXT:    cmovnsq %rcx, %r9
+; X64-NEXT:    cmovnsq %r15, %r11
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    addq %rdi, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    setb %al
+; X64-NEXT:    movzbl %al, %r8d
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq %r8, %rsi
+; X64-NEXT:    imulq %rbp, %r9
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload
+; X64-NEXT:    imulq %r15, %r11
+; X64-NEXT:    addq %rdx, %r11
+; X64-NEXT:    addq %r9, %r11
+; X64-NEXT:    addq %rdi, %rcx
+; X64-NEXT:    adcq %rsi, %r11
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    addq %rdi, %r13
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %r9
-; X64-NEXT:    movq %r11, %rdi
-; X64-NEXT:    movq %r11, %r8
-; X64-NEXT:    sarq $63, %rdi
-; X64-NEXT:    imulq %rdi, %r10
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
-; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    addq %r10, %r11
-; X64-NEXT:    addq %rax, %r11
-; X64-NEXT:    addq %rax, %r13
-; X64-NEXT:    adcq %r9, %r11
-; X64-NEXT:    addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
-; X64-NEXT:    adcq %rbp, %rbx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r11, %rbp
-; X64-NEXT:    sarq $63, %rbp
-; X64-NEXT:    addq %r14, %r13
-; X64-NEXT:    adcq %r15, %r11
+; X64-NEXT:    addq %r8, %r9
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %r9, %r10
+; X64-NEXT:    adcq %rsi, %r8
+; X64-NEXT:    setb %al
+; X64-NEXT:    movzbl %al, %ebx
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %r8, %r9
+; X64-NEXT:    adcq %rbx, %rsi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    imulq %r15, %r14
+; X64-NEXT:    addq %rdx, %r14
+; X64-NEXT:    imulq %rbp, %r12
+; X64-NEXT:    addq %r14, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT:    addq %r9, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    adcq %rsi, %r12
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq %r11, %r10
+; X64-NEXT:    adcq $0, %rax
+; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X64-NEXT:    xorb {{[-0-9]+}}(%r{{[sb]}}p), %cl ## 1-byte Folded Reload
+; X64-NEXT:    movzbl %cl, %edx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    xorq %rcx, %r15
+; X64-NEXT:    xorq %rcx, %r14
+; X64-NEXT:    addq %rdx, %r14
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    cmpq %rdx, %r14
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    sbbq $0, %rdx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    movzbl %dl, %edx
+; X64-NEXT:    xorq %rcx, %r10
+; X64-NEXT:    xorq %rcx, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    xorq %rcx, %r12
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    cmpq %rdx, %rdi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    sbbq $0, %rax
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    adcq $0, %r12
+; X64-NEXT:    orq %rcx, %r12
+; X64-NEXT:    setne %al
+; X64-NEXT:    jmp LBB1_8
+; X64-NEXT:  LBB1_2: ## %overflow.no.rhs.only
+; X64-NEXT:    movq %rbx, %rdx
+; X64-NEXT:    sarq $63, %rdx
+; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    xorq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    xorq %rdx, %r14
+; X64-NEXT:    movq %r9, %r13
+; X64-NEXT:    xorq %rdx, %r13
+; X64-NEXT:    movq %r15, %r10
+; X64-NEXT:    xorq %rdx, %r10
+; X64-NEXT:    subq %rdx, %r10
+; X64-NEXT:    sbbq %rdx, %r13
+; X64-NEXT:    sbbq %rdx, %r14
+; X64-NEXT:    sbbq %rdx, %rcx
+; X64-NEXT:    testq %rbx, %rbx
+; X64-NEXT:    sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
+; X64-NEXT:    cmovnsq %rbx, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    cmovnsq %rax, %r14
+; X64-NEXT:    cmovnsq %r9, %r13
+; X64-NEXT:    cmovnsq %r15, %r10
 ; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    adcq %rbp, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq %r12, %rbp
+; X64-NEXT:    xorq %rax, %rbp
+; X64-NEXT:    movq %r12, %rsi
+; X64-NEXT:    movq %r8, %r12
+; X64-NEXT:    xorq %rax, %r12
+; X64-NEXT:    movq %r11, %rbx
+; X64-NEXT:    xorq %rax, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; X64-NEXT:    movq %rdx, %rdi
+; X64-NEXT:    xorq %rax, %rdi
+; X64-NEXT:    subq %rax, %rdi
+; X64-NEXT:    sbbq %rax, %rbx
+; X64-NEXT:    sbbq %rax, %r12
+; X64-NEXT:    sbbq %rax, %rbp
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
+; X64-NEXT:    cmovnsq %rsi, %rbp
+; X64-NEXT:    cmovnsq %r8, %r12
+; X64-NEXT:    cmovnsq %r11, %rbx
+; X64-NEXT:    cmovnsq %rdx, %rdi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r12, %rbp
-; X64-NEXT:    movq %r8, %rbx
-; X64-NEXT:    imulq %rcx, %r8
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %rax, %rsi
-; X64-NEXT:    addq %r8, %rsi
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
-; X64-NEXT:    imulq %r12, %rcx
+; X64-NEXT:    addq %rcx, %r9
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r9, %r8
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    setb %al
+; X64-NEXT:    movzbl %al, %r9d
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rcx, %r11
+; X64-NEXT:    adcq %r9, %rsi
+; X64-NEXT:    imulq %r14, %rbx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload
+; X64-NEXT:    addq %rbx, %r9
+; X64-NEXT:    addq %rdi, %r9
+; X64-NEXT:    addq %r11, %rcx
+; X64-NEXT:    adcq %rsi, %r9
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    addq %r11, %rbx
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    addq %rbx, %r10
+; X64-NEXT:    adcq %rsi, %r11
+; X64-NEXT:    setb %al
+; X64-NEXT:    movzbl %al, %r15d
+; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %r11, %rbx
+; X64-NEXT:    adcq %r15, %rsi
+; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload
+; X64-NEXT:    imulq %r14, %rbp
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
+; X64-NEXT:    addq %rdx, %rbp
+; X64-NEXT:    addq %r12, %rbp
+; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    adcq %rsi, %rbp
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq %r9, %r10
+; X64-NEXT:    adcq $0, %rax
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X64-NEXT:    xorb {{[-0-9]+}}(%r{{[sb]}}p), %cl ## 1-byte Folded Reload
+; X64-NEXT:    movzbl %cl, %edx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    negq %rcx
+; X64-NEXT:    xorq %rcx, %r15
+; X64-NEXT:    xorq %rcx, %r14
+; X64-NEXT:    addq %rdx, %r14
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    cmpq %rdx, %r14
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    sbbq $0, %rdx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    movzbl %dl, %edx
+; X64-NEXT:    xorq %rcx, %r10
+; X64-NEXT:    xorq %rcx, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    adcq $0, %r10
+; X64-NEXT:    xorq %rcx, %rbp
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    cmpq %rdx, %rdi
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    sbbq $0, %rax
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    orq %rcx, %rbp
+; X64-NEXT:  LBB1_7: ## %overflow.res
+; X64-NEXT:    setne %al
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    jmp LBB1_8
+; X64-NEXT:  LBB1_5: ## %overflow.no
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    imulq %r11, %rcx
 ; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    imulq %rdi, %rbx
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    imulq %r15, %r12
+; X64-NEXT:    addq %rdx, %r12
+; X64-NEXT:    imulq %r9, %r8
+; X64-NEXT:    addq %r12, %r8
+; X64-NEXT:    addq %rsi, %rcx
+; X64-NEXT:    adcq %rbx, %r8
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    addq %rcx, %r10
-; X64-NEXT:    addq %rax, %r10
-; X64-NEXT:    addq %r9, %r14
-; X64-NEXT:    adcq %rsi, %r10
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %rsi, %rbx
+; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r15
-; X64-NEXT:    movq %rdx, %r9
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    addq %rbx, %r15
+; X64-NEXT:    adcq %r10, %rsi
+; X64-NEXT:    setb %al
+; X64-NEXT:    movzbl %al, %ebx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r9
+; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    addq %r15, %rdi
-; X64-NEXT:    adcq %r9, %r8
-; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    addq %r8, %rax
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    adcq %r10, %rdx
-; X64-NEXT:    addq %r13, %rsi
-; X64-NEXT:    adcq %r11, %rdi
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
-; X64-NEXT:    adcq %rbp, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
-; X64-NEXT:    movq %r8, %rcx
-; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    xorq %rcx, %rax
-; X64-NEXT:    xorq %rcx, %rsi
-; X64-NEXT:    orq %rax, %rsi
-; X64-NEXT:    xorq %rcx, %rdx
-; X64-NEXT:    xorq %rdi, %rcx
-; X64-NEXT:    orq %rdx, %rcx
-; X64-NEXT:    orq %rsi, %rcx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq %r8, 24(%rax)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT:    movq %rcx, (%rax)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT:    movq %rcx, 8(%rax)
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT:    movq %rcx, 16(%rax)
-; X64-NEXT:    setne %al
+; X64-NEXT:    addq %rsi, %rdi
+; X64-NEXT:    adcq %rbx, %r10
+; X64-NEXT:    addq %rcx, %rdi
+; X64-NEXT:    adcq %r8, %r10
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:  LBB1_8: ## %overflow.res
+; X64-NEXT:    movq %r14, (%r13)
+; X64-NEXT:    movq %r15, 8(%r13)
+; X64-NEXT:    movq %rdi, 16(%r13)
+; X64-NEXT:    movq %r10, 24(%r13)
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
@@ -514,7 +1407,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: smuloi256:
-; X86:       ## %bb.0:
+; X86:       ## %bb.0: ## %overflow.entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %ebx
@@ -529,334 +1422,1687 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    je LBB1_12
+; X86-NEXT:  ## %bb.1: ## %overflow.lhs
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    je LBB1_2
+; X86-NEXT:  ## %bb.14: ## %overflow
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl %edx, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    imull %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    jmp LBB1_15
+; X86-NEXT:  LBB1_12: ## %overflow.no.lhs
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    je LBB1_13
+; X86-NEXT:  ## %bb.7: ## %overflow.no.lhs.only
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    js LBB1_9
+; X86-NEXT:  ## %bb.8: ## %overflow.no.lhs.only
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:  LBB1_9: ## %overflow.no.lhs.only
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    js LBB1_11
+; X86-NEXT:  ## %bb.10: ## %overflow.no.lhs.only
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:  LBB1_11: ## %overflow.no.lhs.only
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl (%esp), %ebp ## 4-byte Reload
+; X86-NEXT:    imull %eax, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl (%esp), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    imull %ebx, %ebp
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    imull %ecx, %edx
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %esi
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl (%esp), %ebp ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebp, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    cmpl %esi, %ebp
+; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    jmp LBB1_15
+; X86-NEXT:  LBB1_2: ## %overflow.no.rhs.only
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    subl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    js LBB1_4
+; X86-NEXT:  ## %bb.3: ## %overflow.no.rhs.only
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:  LBB1_4: ## %overflow.no.rhs.only
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    xorl %eax, %ebp
+; X86-NEXT:    xorl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    subl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl %edi, (%esp) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    js LBB1_6
+; X86-NEXT:  ## %bb.5: ## %overflow.no.rhs.only
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sarl $31, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %edx, %ebp
-; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:  LBB1_6: ## %overflow.no.rhs.only
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movzbl (%esp), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
@@ -866,203 +3112,277 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sarl $31, %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %ebp, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl %edx, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    imull %ecx, %edx
+; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %ebx
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    cmpl %ebx, %eax
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %edx
+; X86-NEXT:    movl (%esp), %ebx ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    xorl %ecx, %edi
+; X86-NEXT:    xorl %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    cmpl %esi, %ebx
+; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    orl %ecx, %ebp
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:  LBB1_15: ## %overflow.res
+; X86-NEXT:    setne %al
+; X86-NEXT:    jmp LBB1_16
+; X86-NEXT:  LBB1_13: ## %overflow.no
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
@@ -1073,58 +3393,60 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    addl (%esp), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
@@ -1133,15 +3455,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1150,136 +3472,151 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ebp, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %edi, %ebp
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    imull %ecx, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %esi, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    movl %edi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  LBB1_16: ## %overflow.res
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    orl %edi, %ebx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    orl %ebx, %edi
-; X86-NEXT:    xorl %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    orl %ebp, %ecx
-; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 28(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 16(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 20(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    movl %ecx, 24(%eax)
-; X86-NEXT:    setne %al
+; X86-NEXT:    movl %edx, 4(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl %esi, 16(%ecx)
+; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, 20(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, 24(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl %edx, 28(%ecx)
+; X86-NEXT:    andb $1, %al
+; X86-NEXT:    ## kill: def $al killed $al killed $eax
 ; X86-NEXT:    addl $128, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 4c3170304b980..4ccb90a37ca71 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -4,14 +4,19 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X64-LABEL: muloti_test:
-; X64:       # %bb.0: # %start
+; X64:       # %bb.0: # %overflow.entry
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    je .LBB0_3
+; X64-NEXT:  # %bb.1: # %overflow.lhs
 ; X64-NEXT:    testq %rcx, %rcx
-; X64-NEXT:    setne %dl
+; X64-NEXT:    je .LBB0_7
+; X64-NEXT:  # %bb.2: # %overflow
+; X64-NEXT:    setne %al
 ; X64-NEXT:    testq %rsi, %rsi
 ; X64-NEXT:    setne %r9b
-; X64-NEXT:    andb %dl, %r9b
+; X64-NEXT:    andb %al, %r9b
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    seto %r10b
@@ -26,10 +31,59 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    orb %r11b, %cl
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB0_3: # %overflow.no.lhs
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %r8
+; X64-NEXT:    testq %rcx, %rcx
+; X64-NEXT:    je .LBB0_8
+; X64-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    imulq %rsi, %r8
+; X64-NEXT:    addq %rdx, %r8
+; X64-NEXT:    imulq %rcx, %rsi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    addq %r8, %rdx
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    jmp .LBB0_5
+; X64-NEXT:  .LBB0_7: # %overflow.no.rhs.only
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    imulq %rcx, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    imulq %rsi, %rcx
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    addq %rdi, %rdx
+; X64-NEXT:    adcq %rcx, %rsi
+; X64-NEXT:  .LBB0_5: # %overflow.no.lhs.only
+; X64-NEXT:    setne %cl
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB0_8: # %overflow.no
+; X64-NEXT:    imulq %rcx, %rdi
+; X64-NEXT:    addq %rdx, %rdi
+; X64-NEXT:    imulq %r8, %rsi
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    andb $1, %cl
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: muloti_test:
-; X86:       # %bb.0: # %start
+; X86:       # %bb.0: # %overflow.entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %ebx
@@ -38,116 +92,352 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $24, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 44
+; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 56
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  # %bb.1: # %overflow.lhs
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    je .LBB0_2
+; X86-NEXT:  # %bb.6: # %overflow
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    leal (%edi,%eax), %ecx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    leal (%ecx,%eax), %esi
-; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    leal (%esi,%eax), %esi
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    leal (%ebx,%eax), %ebx
 ; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    setne %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    setne %ah
+; X86-NEXT:    andb %al, %ah
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    adcl %ebx, %edx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    setne %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    setne %ch
-; X86-NEXT:    andb %cl, %ch
-; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    orb %ch, %cl
+; X86-NEXT:    orb %ah, %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    setne %cl
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    setne %al
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    setne %ah
+; X86-NEXT:    andb %al, %ah
+; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT:    orb %ah, %al
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT:    orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    setne %ch
-; X86-NEXT:    andb %cl, %ch
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X86-NEXT:    orb %ch, %bl
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X86-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    setne %bh
 ; X86-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setne %bl
+; X86-NEXT:    andb %ch, %bl
+; X86-NEXT:    orb %al, %bl
+; X86-NEXT:    orb %cl, %bl
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_4: # %overflow.no.lhs
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    je .LBB0_5
+; X86-NEXT:  # %bb.3: # %overflow.no.lhs.only
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    orl %eax, %ebp
+; X86-NEXT:    setne %bl
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_2: # %overflow.no.rhs.only
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    adcl %edi, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    setne %bl
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_5: # %overflow.no
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    imull %esi, %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull %esi, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    imull %edx, %ebp
+; X86-NEXT:    addl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
-; X86-NEXT:    setne %al
-; X86-NEXT:    andb %bh, %al
-; X86-NEXT:    orb %bl, %al
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT:    andb $1, %al
-; X86-NEXT:    movb %al, 16(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl %ebx, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:  .LBB0_7: # %overflow.res
+; X86-NEXT:    andb $1, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movb %bl, 16(%eax)
+; X86-NEXT:    addl $36, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
index 132683cdb0f9e..99dc422a6b53e 100644
--- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -3,7 +3,7 @@
 
 define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; X86-LABEL: mulodi_test:
-; X86:       # %bb.0: # %start
+; X86:       # %bb.0: # %overflow.entry
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    pushl %ebx
@@ -12,32 +12,89 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 24
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    setne %dl
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  # %bb.1: # %overflow.lhs
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    je .LBB0_2
+; X86-NEXT:  # %bb.6: # %overflow
+; X86-NEXT:    setne %al
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    setne %cl
-; X86-NEXT:    andb %dl, %cl
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    andb %al, %cl
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    seto %ch
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    seto %bl
+; X86-NEXT:    orb %ch, %bl
+; X86-NEXT:    orb %cl, %bl
+; X86-NEXT:    leal (%edi,%eax), %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %bl, %ch
-; X86-NEXT:    orb %cl, %ch
-; X86-NEXT:    leal (%edi,%eax), %esi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    orb %bl, %cl
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_4: # %overflow.no.lhs
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    je .LBB0_5
+; X86-NEXT:  # %bb.3: # %overflow.no.lhs.only
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %edi, %ebp
+; X86-NEXT:    addl %edx, %ebp
+; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl %ebp, %edx
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    setne %cl
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_2: # %overflow.no.rhs.only
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    imull %edi, %ebx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    orb %ch, %cl
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    setne %cl
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_5: # %overflow.no
+; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    imull %ebp, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:  .LBB0_7: # %overflow.res
+; X86-NEXT:    andb $1, %cl
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index a076d0d762aa3..2601b73f26822 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -13,7 +13,7 @@ define {i64, i1} @t1() nounwind {
 ; CHECK-NEXT:    retq
 ;
 ; WIN32-LABEL: t1:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    movl $72, %eax
 ; WIN32-NEXT:    xorl %edx, %edx
 ; WIN32-NEXT:    xorl %ecx, %ecx
@@ -30,7 +30,7 @@ define {i64, i1} @t2() nounwind {
 ; CHECK-NEXT:    retq
 ;
 ; WIN32-LABEL: t2:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    xorl %eax, %eax
 ; WIN32-NEXT:    xorl %edx, %edx
 ; WIN32-NEXT:    xorl %ecx, %ecx
@@ -47,7 +47,7 @@ define {i64, i1} @t3() nounwind {
 ; CHECK-NEXT:    retq
 ;
 ; WIN32-LABEL: t3:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    movl $-9, %eax
 ; WIN32-NEXT:    movl $-1, %edx
 ; WIN32-NEXT:    movb $1, %cl
@@ -204,59 +204,207 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smuloi64:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $8, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    subl $16, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %edi, %esi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %ebx, %esi
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    adcl %esi, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    sarl $31, %edx
 ; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    subl %edx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ebx, %edx
+; WIN32-NEXT:    je LBB6_13
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB6_2
+; WIN32-NEXT:  # %bb.15: # %overflow
+; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    sarl $31, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    imull %esi, %ebp
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %ebx, %esi
 ; WIN32-NEXT:    sarl $31, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    imull %eax, %esi
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    adcl %esi, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    sarl $31, %edi
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; WIN32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    imull %ebx
 ; WIN32-NEXT:    addl %ebp, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl %ecx, %ebp
-; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl %edi, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ecx, %eax
-; WIN32-NEXT:    adcl %ebp, %edx
-; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
-; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; WIN32-NEXT:    adcl %edi, %edx
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; WIN32-NEXT:    movl %edi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT:    movl %ecx, (%eax)
+; WIN32-NEXT:    movl %edi, %ecx
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $8, %esp
+; WIN32-NEXT:    jmp LBB6_16
+; WIN32-NEXT:  LBB6_13: # %overflow.no.lhs
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB6_14
+; WIN32-NEXT:  # %bb.7: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    subl %eax, %esi
+; WIN32-NEXT:    sbbl %eax, %ebp
+; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB6_9
+; WIN32-NEXT:  # %bb.8: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:  LBB6_9: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %ecx
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    sets (%esp) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB6_11
+; WIN32-NEXT:  # %bb.10: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:  LBB6_11: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %ecx, %ebp
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; WIN32-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; WIN32-NEXT:    jmp LBB6_12
+; WIN32-NEXT:  LBB6_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %ebp
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB6_4
+; WIN32-NEXT:  # %bb.3: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:  LBB6_4: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edi, %ebx
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    subl %eax, %esi
+; WIN32-NEXT:    sbbl %eax, %edi
+; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    sets (%esp) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB6_6
+; WIN32-NEXT:  # %bb.5: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:  LBB6_6: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    imull %ebp, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %edi, %ebp
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %esi, %ecx
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; WIN32-NEXT:    xorb (%esp), %al # 1-byte Folded Reload
+; WIN32-NEXT:  LBB6_12: # %overflow.res
+; WIN32-NEXT:    movzbl %al, %esi
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    negl %eax
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    addl %esi, %ebp
+; WIN32-NEXT:    xorl %ebx, %ebx
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    subl %esi, %edi
+; WIN32-NEXT:    setb %bl
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    xorl %edx, %eax
+; WIN32-NEXT:    movl %ecx, %edx
+; WIN32-NEXT:    subl %ebx, %edx
+; WIN32-NEXT:    adcl $0, %eax
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    jmp LBB6_16
+; WIN32-NEXT:  LBB6_14: # %overflow.no
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    imull %edi, %ebx
+; WIN32-NEXT:    addl %edx, %ebx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:  LBB6_16: # %overflow.res
+; WIN32-NEXT:    movl %ebp, (%esi)
+; WIN32-NEXT:    movl %ecx, 4(%esi)
+; WIN32-NEXT:    andb $1, %al
+; WIN32-NEXT:    # kill: def $al killed $al killed $eax
+; WIN32-NEXT:    addl $16, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -449,37 +597,93 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi64:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %dl
-; WIN32-NEXT:    testl %eax, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    je LBB10_5
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    je LBB10_2
+; WIN32-NEXT:  # %bb.7: # %overflow
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testl %ebx, %ebx
 ; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    andb %al, %cl
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    seto %bl
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    seto %ch
 ; WIN32-NEXT:    orb %bl, %ch
 ; WIN32-NEXT:    orb %cl, %ch
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %edx, 4(%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; WIN32-NEXT:    leal (%edx,%eax), %ebx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    setb %dl
+; WIN32-NEXT:    orb %ch, %dl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    jmp LBB10_8
+; WIN32-NEXT:  LBB10_5: # %overflow.no.lhs
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    je LBB10_6
+; WIN32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %ebp, %ebx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT:    adcl %ebx, %edx
+; WIN32-NEXT:    jmp LBB10_3
+; WIN32-NEXT:  LBB10_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %ebx, %ebp
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:  LBB10_3: # %overflow.res
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    jmp LBB10_8
+; WIN32-NEXT:  LBB10_6: # %overflow.no
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %esi, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %edi, %ebx
+; WIN32-NEXT:    xorl %edx, %edx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:  LBB10_8: # %overflow.res
+; WIN32-NEXT:    movl %esi, (%ecx)
+; WIN32-NEXT:    movl %eax, 4(%ecx)
+; WIN32-NEXT:    andb $1, %dl
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -547,75 +751,224 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smuloselecti64:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %edi, %esi
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    movl %ebx, %eax
-; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    subl %ecx, %edx
+; WIN32-NEXT:    je LBB12_13
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    subl %esi, %ecx
+; WIN32-NEXT:    je LBB12_2
+; WIN32-NEXT:  # %bb.15: # %overflow
+; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    imull %esi, %ecx
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    adcl %esi, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    adcl %ecx, %ebx
 ; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    sarl $31, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl %ecx, %esi
-; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %eax, %esi
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    addl %ebp, %edi
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %ecx, %ebp
-; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    addl %esi, %edi
+; WIN32-NEXT:    adcl %ecx, %ebp
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    addl %ebx, %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; WIN32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
-; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    imull %ebx
-; WIN32-NEXT:    addl %ecx, %eax
-; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    adcl %esi, %edx
 ; WIN32-NEXT:    sarl $31, %edi
 ; WIN32-NEXT:    xorl %edi, %edx
 ; WIN32-NEXT:    xorl %eax, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    orl %edx, %edi
-; WIN32-NEXT:    jne LBB12_2
-; WIN32-NEXT:  # %bb.1:
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    testb $1, %cl
+; WIN32-NEXT:    je LBB12_17
+; WIN32-NEXT:    jmp LBB12_18
+; WIN32-NEXT:  LBB12_13: # %overflow.no.lhs
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    subl %esi, %ecx
+; WIN32-NEXT:    je LBB12_14
+; WIN32-NEXT:  # %bb.8: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    sarl $31, %ecx
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    xorl %ecx, %esi
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    xorl %ecx, %edi
+; WIN32-NEXT:    subl %ecx, %edi
+; WIN32-NEXT:    sbbl %ecx, %esi
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB12_10
+; WIN32-NEXT:  # %bb.9: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebp, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:  LBB12_10: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    subl %eax, %ebp
+; WIN32-NEXT:    sbbl %eax, %ecx
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB12_12
+; WIN32-NEXT:  # %bb.11: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:  LBB12_12: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    imull %esi, %ebp
+; WIN32-NEXT:    addl %edx, %ebp
+; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    adcl %esi, %edi
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; WIN32-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; WIN32-NEXT:    movzbl %cl, %esi
+; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    negl %ecx
+; WIN32-NEXT:    xorl %ecx, %ebx
+; WIN32-NEXT:    addl %esi, %ebx
+; WIN32-NEXT:    xorl %edx, %edx
+; WIN32-NEXT:    subl %esi, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    setb %dl
+; WIN32-NEXT:    xorl %ecx, %eax
+; WIN32-NEXT:    addl %edx, %eax
+; WIN32-NEXT:    xorl %edi, %ecx
+; WIN32-NEXT:    subl %edx, %eax
+; WIN32-NEXT:    adcl $0, %ecx
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    jmp LBB12_7
+; WIN32-NEXT:  LBB12_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
 ; WIN32-NEXT:    movl %ebx, %esi
-; WIN32-NEXT:  LBB12_2:
-; WIN32-NEXT:    movl %esi, %edx
-; WIN32-NEXT:    addl $4, %esp
-; WIN32-NEXT:    popl %esi
-; WIN32-NEXT:    popl %edi
-; WIN32-NEXT:    popl %ebx
-; WIN32-NEXT:    popl %ebp
-; WIN32-NEXT:    retl
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i32 @umuloselecti32(i32 %v1, i32 %v2) {
-; LINUX-LABEL: umuloselecti32:
-; LINUX:       # %bb.0:
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %esi
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB12_4
+; WIN32-NEXT:  # %bb.3: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:  LBB12_4: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    subl %eax, %ebp
+; WIN32-NEXT:    sbbl %eax, %ecx
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB12_6
+; WIN32-NEXT:  # %bb.5: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:  LBB12_6: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    imull %esi, %ebp
+; WIN32-NEXT:    addl %edx, %ebp
+; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    adcl %esi, %edi
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; WIN32-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; WIN32-NEXT:    movzbl %cl, %esi
+; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    negl %ecx
+; WIN32-NEXT:    xorl %ecx, %ebx
+; WIN32-NEXT:    addl %esi, %ebx
+; WIN32-NEXT:    xorl %edx, %edx
+; WIN32-NEXT:    subl %esi, %ebx
+; WIN32-NEXT:    setb %dl
+; WIN32-NEXT:    xorl %ecx, %eax
+; WIN32-NEXT:    addl %edx, %eax
+; WIN32-NEXT:    xorl %edi, %ecx
+; WIN32-NEXT:    subl %edx, %eax
+; WIN32-NEXT:    adcl $0, %ecx
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:  LBB12_7: # %overflow.res
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    testb $1, %cl
+; WIN32-NEXT:    jne LBB12_18
+; WIN32-NEXT:  LBB12_17: # %overflow.res
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:  LBB12_18: # %overflow.res
+; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    addl $8, %esp
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB12_14: # %overflow.no
+; WIN32-NEXT:    xorl %ecx, %ecx
+; WIN32-NEXT:    testb $1, %cl
+; WIN32-NEXT:    je LBB12_17
+; WIN32-NEXT:    jmp LBB12_18
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+define i32 @umuloselecti32(i32 %v1, i32 %v2) {
+; LINUX-LABEL: umuloselecti32:
+; LINUX:       # %bb.0:
 ; LINUX-NEXT:    movl %edi, %eax
 ; LINUX-NEXT:    mull %esi
 ; LINUX-NEXT:    cmovol %edi, %esi
@@ -670,45 +1023,86 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloselecti64:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    je LBB14_5
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    je LBB14_2
+; WIN32-NEXT:  # %bb.7: # %overflow
 ; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %bl
-; WIN32-NEXT:    andb %al, %bl
+; WIN32-NEXT:    setne %cl
+; WIN32-NEXT:    andb %al, %cl
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %edi, %edx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    movl %edx, %ebp
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    seto %bh
-; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    orb %bl, %bh
-; WIN32-NEXT:    addl %eax, %edi
-; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    addl %edi, %edx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    seto %ch
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
+; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    addl %eax, %ebp
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    addl %ebp, %edx
 ; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %bh, %al
-; WIN32-NEXT:    testb %al, %al
-; WIN32-NEXT:    jne LBB14_2
-; WIN32-NEXT:  # %bb.1:
+; WIN32-NEXT:    orb %ch, %al
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    je LBB14_9
+; WIN32-NEXT:    jmp LBB14_10
+; WIN32-NEXT:  LBB14_5: # %overflow.no.lhs
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    je LBB14_6
+; WIN32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl %ebp, %edi
 ; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    imull %esi, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    movl %esi, %ebp
+; WIN32-NEXT:    imull %eax, %ebp
+; WIN32-NEXT:    movl %eax, %edx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    jmp LBB14_3
+; WIN32-NEXT:  LBB14_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    imull %edi, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl %edi, %ebp
+; WIN32-NEXT:    imull %esi, %ebp
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:  LBB14_3: # %overflow.res
+; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    jne LBB14_10
+; WIN32-NEXT:  LBB14_9: # %overflow.res
+; WIN32-NEXT:    movl %edi, %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:  LBB14_2:
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:  LBB14_10: # %overflow.res
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    movl %esi, %edx
 ; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
@@ -716,6 +1110,12 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
+; WIN32-NEXT:  LBB14_6: # %overflow.no
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    je LBB14_9
+; WIN32-NEXT:    jmp LBB14_10
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -952,35 +1352,47 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smulobri64:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    subl $8, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl %ebp, %ecx
-; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    imull %edi, %ecx
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    sarl $31, %edx
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    subl %edx, %esi
+; WIN32-NEXT:    je LBB18_12
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB18_2
+; WIN32-NEXT:  # %bb.14: # %overflow1
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    addl %ebx, %ebp
-; WIN32-NEXT:    adcl %ecx, %edi
+; WIN32-NEXT:    adcl %esi, %edi
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    sarl $31, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %ecx, %edx
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    imull %esi, %ecx
-; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %eax, %ecx
 ; WIN32-NEXT:    mull %edx
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %esi
@@ -989,7 +1401,7 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl %ebx, %ebp
 ; WIN32-NEXT:    sarl $31, %ebp
 ; WIN32-NEXT:    addl %edi, %ebx
-; WIN32-NEXT:    adcl (%esp), %ebp # 4-byte Folded Reload
+; WIN32-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    imull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    addl %ebx, %eax
@@ -998,19 +1410,148 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    xorl %esi, %edx
 ; WIN32-NEXT:    xorl %eax, %esi
 ; WIN32-NEXT:    orl %edx, %esi
-; WIN32-NEXT:    jne LBB18_1
-; WIN32-NEXT:  # %bb.3: # %continue
+; WIN32-NEXT:    jmp LBB18_15
+; WIN32-NEXT:  LBB18_12: # %overflow.no.lhs
+; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB18_13
+; WIN32-NEXT:  # %bb.7: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    movl %edi, %ebx
+; WIN32-NEXT:    xorl %eax, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %ebx
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB18_9
+; WIN32-NEXT:  # %bb.8: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:  LBB18_9: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    subl %eax, %esi
+; WIN32-NEXT:    sbbl %eax, %ebp
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB18_11
+; WIN32-NEXT:  # %bb.10: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:  LBB18_11: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %ebp, %ebx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    adcl %ebx, %edx
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; WIN32-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; WIN32-NEXT:    movzbl %bl, %edi
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    negl %esi
+; WIN32-NEXT:    xorl %esi, %ecx
+; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    xorl %ebx, %ebx
+; WIN32-NEXT:    subl %edi, %ecx
+; WIN32-NEXT:    setb %bl
+; WIN32-NEXT:    xorl %esi, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    xorl %edx, %esi
+; WIN32-NEXT:    subl %ebx, %eax
+; WIN32-NEXT:    adcl $0, %esi
+; WIN32-NEXT:    jmp LBB18_15
+; WIN32-NEXT:  LBB18_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebp, %ebx
+; WIN32-NEXT:    xorl %eax, %ebx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %ebx
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    js LBB18_4
+; WIN32-NEXT:  # %bb.3: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebp, %ebx
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:  LBB18_4: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    movl %esi, %ebp
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    subl %eax, %ebp
+; WIN32-NEXT:    sbbl %eax, %ecx
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB18_6
+; WIN32-NEXT:  # %bb.5: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %esi, %ebp
+; WIN32-NEXT:  LBB18_6: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    imull %ebx, %ebp
+; WIN32-NEXT:    addl %edx, %ebp
+; WIN32-NEXT:    imull %ecx, %ebx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    adcl %ebx, %edx
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; WIN32-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; WIN32-NEXT:    movzbl %cl, %edi
+; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    negl %ecx
+; WIN32-NEXT:    xorl %ecx, %esi
+; WIN32-NEXT:    addl %edi, %esi
+; WIN32-NEXT:    xorl %ebx, %ebx
+; WIN32-NEXT:    subl %edi, %esi
+; WIN32-NEXT:    setb %bl
+; WIN32-NEXT:    xorl %ecx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    xorl %edx, %ecx
+; WIN32-NEXT:    subl %ebx, %eax
+; WIN32-NEXT:    adcl $0, %ecx
+; WIN32-NEXT:  LBB18_15: # %overflow.res
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    jne LBB18_17
+; WIN32-NEXT:  LBB18_19: # %continue
 ; WIN32-NEXT:    movb $1, %al
-; WIN32-NEXT:  LBB18_2: # %overflow
-; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:  LBB18_18: # %overflow
+; WIN32-NEXT:    addl $8, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
-; WIN32-NEXT:  LBB18_1: # %overflow
+; WIN32-NEXT:  LBB18_13: # %overflow.no
 ; WIN32-NEXT:    xorl %eax, %eax
-; WIN32-NEXT:    jmp LBB18_2
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    je LBB18_19
+; WIN32-NEXT:  LBB18_17: # %overflow
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    jmp LBB18_18
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -1261,46 +1802,90 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umulobri64:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    je LBB22_5
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    je LBB22_2
+; WIN32-NEXT:  # %bb.7: # %overflow1
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testl %ebx, %ebx
 ; WIN32-NEXT:    setne %dl
-; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    andb %al, %dl
+; WIN32-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ebp
 ; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    orb %bl, %bh
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT:    leal (%ebp,%eax), %edi
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
-; WIN32-NEXT:    orb %cl, %ch
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %esi, %edx
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    addl %edi, %edx
 ; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %ch, %al
-; WIN32-NEXT:    subb $1, %al
-; WIN32-NEXT:    je LBB22_1
-; WIN32-NEXT:  # %bb.3: # %continue
+; WIN32-NEXT:    orb %bh, %al
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    je LBB22_11
+; WIN32-NEXT:    jmp LBB22_9
+; WIN32-NEXT:  LBB22_5: # %overflow.no.lhs
+; WIN32-NEXT:    testl %edi, %edi
+; WIN32-NEXT:    je LBB22_6
+; WIN32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    imull %ebx, %ecx
+; WIN32-NEXT:    addl %edx, %ecx
+; WIN32-NEXT:    imull %edi, %ebx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    addl %ecx, %eax
+; WIN32-NEXT:    adcl %ebx, %edx
+; WIN32-NEXT:    jmp LBB22_3
+; WIN32-NEXT:  LBB22_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    adcl %edi, %edx
+; WIN32-NEXT:  LBB22_3: # %overflow.res
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    jne LBB22_9
+; WIN32-NEXT:  LBB22_11: # %continue
 ; WIN32-NEXT:    movb $1, %al
-; WIN32-NEXT:  LBB22_2: # %overflow
+; WIN32-NEXT:  LBB22_10: # %overflow
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
-; WIN32-NEXT:  LBB22_1: # %overflow
+; WIN32-NEXT:  LBB22_6: # %overflow.no
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:    testb $1, %al
+; WIN32-NEXT:    je LBB22_11
+; WIN32-NEXT:  LBB22_9: # %overflow
 ; WIN32-NEXT:    xorl %eax, %eax
-; WIN32-NEXT:    jmp LBB22_2
+; WIN32-NEXT:    jmp LBB22_10
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -1334,18 +1919,33 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: bug27873:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebx
-; WIN32-NEXT:    movl $160, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    seto %bl
-; WIN32-NEXT:    movl $160, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ecx, %edx
-; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %bl, %al
-; WIN32-NEXT:    orb {{[0-9]+}}(%esp), %al
+; WIN32-NEXT:    pushl %edi
+; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    je LBB23_2
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl $160, %ebx
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    adcl $0, %edx
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    jmp LBB23_3
+; WIN32-NEXT:  LBB23_2: # %overflow.no.lhs
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:  LBB23_3: # %overflow.res
+; WIN32-NEXT:    orb %al, %cl
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    popl %esi
+; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
 ; WIN32-NEXT:    retl
   %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
@@ -1635,62 +2235,208 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smuloi64_load:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $12, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    subl $16, %esp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ecx
-; WIN32-NEXT:    movl 4(%eax), %ebp
-; WIN32-NEXT:    movl %ebp, %esi
-; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    movl (%eax), %edi
+; WIN32-NEXT:    movl 4(%eax), %ecx
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    sarl $31, %edx
+; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    subl %edx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ebx, %edx
+; WIN32-NEXT:    je LBB30_13
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB30_2
+; WIN32-NEXT:  # %bb.15: # %overflow
+; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    sarl $31, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    imull %esi, %ebp
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %ebx, %esi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %edi, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    adcl %esi, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    adcl %esi, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %ebp, %edi
 ; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; WIN32-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %ecx, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    imull %ebx
 ; WIN32-NEXT:    addl %ebp, %eax
-; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl %ecx, %ebp
-; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl %edi, %ebp
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %ecx, %eax
-; WIN32-NEXT:    adcl %ebp, %edx
-; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
-; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; WIN32-NEXT:    adcl %edi, %edx
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; WIN32-NEXT:    movl %edi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT:    movl %ecx, (%eax)
+; WIN32-NEXT:    movl %edi, %ecx
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $12, %esp
+; WIN32-NEXT:    jmp LBB30_16
+; WIN32-NEXT:  LBB30_13: # %overflow.no.lhs
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB30_14
+; WIN32-NEXT:  # %bb.7: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    subl %eax, %esi
+; WIN32-NEXT:    sbbl %eax, %ebp
+; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB30_9
+; WIN32-NEXT:  # %bb.8: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ecx, %ebp
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:  LBB30_9: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %ecx
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    sets (%esp) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB30_11
+; WIN32-NEXT:  # %bb.10: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebx, %ecx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:  LBB30_11: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %ecx, %ebp
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; WIN32-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; WIN32-NEXT:    jmp LBB30_12
+; WIN32-NEXT:  LBB30_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    subl %eax, %esi
+; WIN32-NEXT:    sbbl %eax, %ebp
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB30_4
+; WIN32-NEXT:  # %bb.3: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebx, %ebp
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:  LBB30_4: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %esi, %edx
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ecx, %ebx
+; WIN32-NEXT:    xorl %eax, %ebx
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    subl %eax, %esi
+; WIN32-NEXT:    sbbl %eax, %ebx
+; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    sets (%esp) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB30_6
+; WIN32-NEXT:  # %bb.5: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ecx, %ebx
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:  LBB30_6: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    imull %ebp, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %ebx, %ebp
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %esi, %ecx
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:    movl %edi, %ebp
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; WIN32-NEXT:    xorb (%esp), %al # 1-byte Folded Reload
+; WIN32-NEXT:  LBB30_12: # %overflow.res
+; WIN32-NEXT:    movzbl %al, %esi
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    negl %eax
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    addl %esi, %ebp
+; WIN32-NEXT:    xorl %ebx, %ebx
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    subl %esi, %edi
+; WIN32-NEXT:    setb %bl
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    xorl %edx, %eax
+; WIN32-NEXT:    movl %ecx, %edx
+; WIN32-NEXT:    subl %ebx, %edx
+; WIN32-NEXT:    adcl $0, %eax
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    jmp LBB30_16
+; WIN32-NEXT:  LBB30_14: # %overflow.no
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    mull %edx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    imull %edi, %ebx
+; WIN32-NEXT:    addl %edx, %ebx
+; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:  LBB30_16: # %overflow.res
+; WIN32-NEXT:    movl %ebp, (%esi)
+; WIN32-NEXT:    movl %ecx, 4(%esi)
+; WIN32-NEXT:    andb $1, %al
+; WIN32-NEXT:    # kill: def $al killed $al killed $eax
+; WIN32-NEXT:    addl $16, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1728,61 +2474,206 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: smuloi64_load2:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    subl $12, %esp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl (%ecx), %ebx
-; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl (%edx), %ebx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    subl %esi, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl 4(%edx), %ebp
+; WIN32-NEXT:    movl %ebp, %edx
+; WIN32-NEXT:    je LBB31_13
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB31_2
+; WIN32-NEXT:  # %bb.15: # %overflow
+; WIN32-NEXT:    movl %ecx, %esi
 ; WIN32-NEXT:    sarl $31, %esi
 ; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    mull %ebx
-; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebx
 ; WIN32-NEXT:    movl %edx, %ebx
-; WIN32-NEXT:    movl %eax, %ebp
-; WIN32-NEXT:    addl %ecx, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl 4(%eax), %ecx
-; WIN32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; WIN32-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    adcl %esi, %ebx
-; WIN32-NEXT:    movl %ebx, %edi
-; WIN32-NEXT:    sarl $31, %edi
-; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %ebp, %esi
 ; WIN32-NEXT:    sarl $31, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull %eax, %esi
-; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    movl %edx, %ecx
-; WIN32-NEXT:    addl %ebp, %eax
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    adcl %esi, %ecx
-; WIN32-NEXT:    movl %ecx, %ebp
-; WIN32-NEXT:    sarl $31, %ebp
-; WIN32-NEXT:    addl %ebx, %ecx
-; WIN32-NEXT:    adcl %edi, %ebp
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    imull (%esp) # 4-byte Folded Reload
-; WIN32-NEXT:    addl %ecx, %eax
-; WIN32-NEXT:    adcl %ebp, %edx
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    adcl %esi, %edi
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    sarl $31, %esi
+; WIN32-NEXT:    addl %ebx, %edi
+; WIN32-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    imull %ebp
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; WIN32-NEXT:    movl %edi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    xorl %ecx, %edx
 ; WIN32-NEXT:    xorl %eax, %ecx
 ; WIN32-NEXT:    orl %edx, %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %esi, 4(%eax)
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT:    movl %ecx, (%eax)
+; WIN32-NEXT:    movl %edi, %ecx
 ; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    jmp LBB31_16
+; WIN32-NEXT:  LBB31_13: # %overflow.no.lhs
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    je LBB31_14
+; WIN32-NEXT:  # %bb.8: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %esi
+; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB31_10
+; WIN32-NEXT:  # %bb.9: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ecx, %esi
+; WIN32-NEXT:    movl %edx, %edi
+; WIN32-NEXT:  LBB31_10: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    subl %eax, %edi
+; WIN32-NEXT:    sbbl %eax, %ecx
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    sets (%esp) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB31_12
+; WIN32-NEXT:  # %bb.11: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    movl %ebx, %edi
+; WIN32-NEXT:  LBB31_12: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %edx, %ebx
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    imull %esi, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %ecx, %esi
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %edi, %ecx
+; WIN32-NEXT:    adcl %esi, %edx
+; WIN32-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; WIN32-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; WIN32-NEXT:    jmp LBB31_7
+; WIN32-NEXT:  LBB31_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    xorl %eax, %edi
+; WIN32-NEXT:    movl %ebx, %edx
+; WIN32-NEXT:    xorl %eax, %edx
+; WIN32-NEXT:    subl %eax, %edx
+; WIN32-NEXT:    sbbl %eax, %edi
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB31_4
+; WIN32-NEXT:  # %bb.3: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    movl %ebx, %edx
+; WIN32-NEXT:  LBB31_4: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    sarl $31, %eax
+; WIN32-NEXT:    movl %ecx, %ebx
+; WIN32-NEXT:    xorl %eax, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:    xorl %eax, %esi
+; WIN32-NEXT:    subl %eax, %esi
+; WIN32-NEXT:    sbbl %eax, %ebx
+; WIN32-NEXT:    testl %ecx, %ecx
+; WIN32-NEXT:    sets (%esp) # 1-byte Folded Spill
+; WIN32-NEXT:    js LBB31_6
+; WIN32-NEXT:  # %bb.5: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ecx, %ebx
+; WIN32-NEXT:    movl %edx, %esi
+; WIN32-NEXT:  LBB31_6: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %ebp, %ecx
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    imull %edi, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    movl %eax, %ecx
+; WIN32-NEXT:    addl %esi, %ecx
+; WIN32-NEXT:    adcl %edi, %edx
+; WIN32-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; WIN32-NEXT:    xorb (%esp), %al # 1-byte Folded Reload
+; WIN32-NEXT:  LBB31_7: # %overflow.res
+; WIN32-NEXT:    movzbl %al, %esi
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    negl %eax
+; WIN32-NEXT:    xorl %eax, %ebp
+; WIN32-NEXT:    addl %esi, %ebp
+; WIN32-NEXT:    xorl %ebx, %ebx
+; WIN32-NEXT:    movl %ebp, %edi
+; WIN32-NEXT:    subl %esi, %edi
+; WIN32-NEXT:    setb %bl
+; WIN32-NEXT:    xorl %eax, %ecx
+; WIN32-NEXT:    addl %ebx, %ecx
+; WIN32-NEXT:    xorl %edx, %eax
+; WIN32-NEXT:    movl %ecx, %edx
+; WIN32-NEXT:    subl %ebx, %edx
+; WIN32-NEXT:    adcl $0, %eax
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    jmp LBB31_16
+; WIN32-NEXT:  LBB31_14: # %overflow.no
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    imull %edi, %ebp
+; WIN32-NEXT:    addl %edx, %ebp
+; WIN32-NEXT:    imull %ebx, %ecx
+; WIN32-NEXT:    addl %ebp, %ecx
+; WIN32-NEXT:    movl %eax, %ebp
+; WIN32-NEXT:    xorl %eax, %eax
+; WIN32-NEXT:  LBB31_16: # %overflow.res
+; WIN32-NEXT:    movl %ebp, (%esi)
+; WIN32-NEXT:    movl %ecx, 4(%esi)
+; WIN32-NEXT:    andb $1, %al
+; WIN32-NEXT:    # kill: def $al killed $al killed $eax
 ; WIN32-NEXT:    addl $12, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
@@ -2133,38 +3024,94 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi64_load:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ebp
-; WIN32-NEXT:    movl 4(%eax), %eax
-; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %dl
-; WIN32-NEXT:    testl %eax, %eax
+; WIN32-NEXT:    movl (%eax), %edi
+; WIN32-NEXT:    movl 4(%eax), %ebx
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    je LBB38_5
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    je LBB38_2
+; WIN32-NEXT:  # %bb.7: # %overflow
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testl %ebx, %ebx
 ; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    andb %al, %cl
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    seto %bl
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %edi
 ; WIN32-NEXT:    seto %ch
 ; WIN32-NEXT:    orb %bl, %ch
 ; WIN32-NEXT:    orb %cl, %ch
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %edx, 4(%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; WIN32-NEXT:    leal (%edx,%eax), %ebx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    setb %dl
+; WIN32-NEXT:    orb %ch, %dl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    jmp LBB38_8
+; WIN32-NEXT:  LBB38_5: # %overflow.no.lhs
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    je LBB38_6
+; WIN32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    imull %ebx, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %ebp, %ebx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT:    adcl %ebx, %edx
+; WIN32-NEXT:    jmp LBB38_3
+; WIN32-NEXT:  LBB38_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %ebx, %ebp
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:  LBB38_3: # %overflow.res
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    jmp LBB38_8
+; WIN32-NEXT:  LBB38_6: # %overflow.no
+; WIN32-NEXT:    imull %ebp, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %esi, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    addl %edi, %ebx
+; WIN32-NEXT:    xorl %edx, %edx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:  LBB38_8: # %overflow.res
+; WIN32-NEXT:    movl %esi, (%ecx)
+; WIN32-NEXT:    movl %eax, 4(%ecx)
+; WIN32-NEXT:    andb $1, %dl
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -2210,38 +3157,94 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN64-NEXT:    retq
 ;
 ; WIN32-LABEL: umuloi64_load2:
-; WIN32:       # %bb.0:
+; WIN32:       # %bb.0: # %overflow.entry
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl (%ecx), %ebp
-; WIN32-NEXT:    movl 4(%ecx), %esi
-; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %dl
-; WIN32-NEXT:    testl %esi, %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl (%eax), %edi
+; WIN32-NEXT:    movl 4(%eax), %ebp
+; WIN32-NEXT:    testl %ebx, %ebx
+; WIN32-NEXT:    je LBB39_5
+; WIN32-NEXT:  # %bb.1: # %overflow.lhs
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    je LBB39_2
+; WIN32-NEXT:  # %bb.7: # %overflow
+; WIN32-NEXT:    setne %al
+; WIN32-NEXT:    testl %ebx, %ebx
 ; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    movl %eax, %edi
+; WIN32-NEXT:    andb %al, %cl
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    seto %bl
-; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    seto %ch
 ; WIN32-NEXT:    orb %bl, %ch
 ; WIN32-NEXT:    orb %cl, %ch
-; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl (%esp), %edx # 4-byte Reload
+; WIN32-NEXT:    leal (%edx,%eax), %ebx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    setb %dl
+; WIN32-NEXT:    orb %ch, %dl
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    jmp LBB39_8
+; WIN32-NEXT:  LBB39_5: # %overflow.no.lhs
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    mull %edi
+; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    je LBB39_6
+; WIN32-NEXT:  # %bb.4: # %overflow.no.lhs.only
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    imull %ebx, %edi
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    imull %ebp, %ebx
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    addl %esi, %edx
-; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl %eax, (%esi)
-; WIN32-NEXT:    movl %edx, 4(%esi)
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl %edi, %eax
+; WIN32-NEXT:    adcl %ebx, %edx
+; WIN32-NEXT:    jmp LBB39_3
+; WIN32-NEXT:  LBB39_2: # %overflow.no.rhs.only
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %esi
+; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; WIN32-NEXT:    imull %ebp, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %ebx, %ebp
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    addl %esi, %eax
+; WIN32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; WIN32-NEXT:    adcl %ebp, %edx
+; WIN32-NEXT:  LBB39_3: # %overflow.res
+; WIN32-NEXT:    testl %edx, %edx
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    jmp LBB39_8
+; WIN32-NEXT:  LBB39_6: # %overflow.no
+; WIN32-NEXT:    imull %ebp, %esi
+; WIN32-NEXT:    addl %edx, %esi
+; WIN32-NEXT:    imull %edi, %ebx
+; WIN32-NEXT:    addl %esi, %ebx
+; WIN32-NEXT:    movl %eax, %esi
+; WIN32-NEXT:    xorl %edx, %edx
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:  LBB39_8: # %overflow.res
+; WIN32-NEXT:    movl %esi, (%ecx)
+; WIN32-NEXT:    movl %eax, 4(%ecx)
+; WIN32-NEXT:    andb $1, %dl
+; WIN32-NEXT:    movl %edx, %eax
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx

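For orientation when reading the new WIN32 CHECK lines above: the basic-block labels they carry (overflow.entry, overflow.lhs, overflow, overflow.no.lhs, overflow.no.lhs.only, overflow.no.rhs.only, overflow.no, overflow.res) name the branches of the expanded multiply. The C fragment below is only a minimal sketch of that case split for the unsigned 64-bit tests (e.g. umuloi64_load) lowered on a 32-bit target; the function umul64_overflow_sketch and every variable name in it are hypothetical and not taken from the patch.

    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical helper mirroring the case split visible in the WIN32
       block labels above; names are illustrative only. */
    static bool umul64_overflow_sketch(uint64_t a, uint64_t b, uint64_t *res) {
      uint32_t a_hi = (uint32_t)(a >> 32);
      uint32_t b_hi = (uint32_t)(b >> 32);

      if (a_hi == 0 && b_hi == 0) {
        /* "overflow.no": a 32x32 product always fits in 64 bits. */
        *res = (uint64_t)(uint32_t)a * (uint32_t)b;
        return false;
      }
      if (a_hi == 0 || b_hi == 0) {
        /* "overflow.no.lhs.only" / "overflow.no.rhs.only": one operand is
           narrow, so two 32x32 multiplies reconstruct the product and the
           overflow bit comes from the high partial product alone. */
        uint64_t wide = (a_hi != 0) ? a : b;
        uint32_t narrow = (a_hi != 0) ? (uint32_t)b : (uint32_t)a;
        uint64_t lo = (uint64_t)(uint32_t)wide * narrow;
        uint64_t hi = (wide >> 32) * narrow + (lo >> 32);
        *res = (hi << 32) | (uint32_t)lo;
        return (hi >> 32) != 0;
      }
      /* "overflow": both high halves are nonzero, so for unsigned inputs the
         product is at least 2^64 and this sketch can report overflow directly;
         the generated code instead falls back to the generic checking
         sequence seen in the %overflow blocks above. */
      *res = a * b; /* wrapped low 64 bits, matching the intrinsic's result */
      return true;
    }

Each branch of the sketch lines up with one of the labelled blocks, which is why the narrow cases in the assembly above get by with a short imull/mull sequence while only the %overflow path keeps the full seto-based check.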

