[llvm] [profcheck] Fix profile metadata propagation for Large Integer operations (PR #175862)

Jin Huang via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 16 16:20:14 PST 2026


https://github.com/jinhuang1102 updated https://github.com/llvm/llvm-project/pull/175862

>From 22f8cae7088d8b7def3f933c90a741e106d62fcc Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Fri, 16 Jan 2026 14:17:50 -0800
Subject: [PATCH] [lldb] Fix llvm_unreachable for invalid Wasm address
 (#176464)

We had an llvm_unreachable following a switch on the WasmAddress's type.
However, the type is encoded in a larger 64 bit address, and therefore
it's possible to create an invalid value that doesn't map back on one of
the enum types.

We could try to diagnose that in the wrapper, or treat all invalid types
the same. I took the latter approach because it makes it easier to show
the invalid type after the fact in an error message.

rdar://168314695
---
 llvm/include/llvm/IR/ProfDataUtils.h          |    7 +
 llvm/lib/IR/ProfDataUtils.cpp                 |   14 +
 llvm/lib/Transforms/Utils/IntegerDivision.cpp |   55 +-
 llvm/test/CodeGen/RISCV/idiv_large.ll         | 1061 ++++++++---------
 .../X86/div-rem-pair-recomposition-signed.ll  |  463 +++----
 .../div-rem-pair-recomposition-unsigned.ll    |  479 ++++----
 llvm/test/CodeGen/X86/pr38539.ll              |  187 +--
 .../Transforms/ExpandIRInsts/X86/sdiv129.ll   |   26 +-
 .../Transforms/ExpandIRInsts/X86/srem129.ll   |   26 +-
 .../Transforms/ExpandIRInsts/X86/udiv129.ll   |   26 +-
 .../Transforms/ExpandIRInsts/X86/urem129.ll   |   26 +-
 .../Transforms/ExpandIRInsts/X86/vector.ll    |   88 +-
 12 files changed, 1299 insertions(+), 1159 deletions(-)

diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index c8cfccbe61e90..9cca8a4d2297f 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_IR_PROFDATAUTILS_H
 #define LLVM_IR_PROFDATAUTILS_H
 
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/Compiler.h"
@@ -216,6 +217,12 @@ LLVM_ABI bool hasExplicitlyUnknownBranchWeights(const Instruction &I);
 /// Scaling the profile data attached to 'I' using the ratio of S/T.
 LLVM_ABI void scaleProfData(Instruction &I, uint64_t S, uint64_t T);
 
+// Helper to apply a metadata setting function to an Instruction* if profiling
+// is enabled. If profiling is disabled (ProfcheckDisableMetadataFixes is true)
+// or V is not an Instruction, the callback will not be invoked.
+LLVM_ABI void applyProfMetadataIfEnabled(
+    Value *V, llvm::function_ref<void(Instruction *)> setMetadataCallback);
+
 /// Get the branch weights of a branch conditioned on b1 || b2, where b1 and b2
 /// are 2 booleans that are the conditions of 2 branches for which we have the
 /// branch weights B1 and B2, respectively. In both B1 and B2, the first
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index 1d6a7df5e91da..15cb7f9b2927f 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -13,6 +13,7 @@
 #include "llvm/IR/ProfDataUtils.h"
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -24,6 +25,10 @@
 
 using namespace llvm;
 
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+}
+
 // MD_prof nodes have the following layout
 //
 // In general:
@@ -391,3 +396,12 @@ void llvm::scaleProfData(Instruction &I, uint64_t S, uint64_t T) {
     }
   I.setMetadata(LLVMContext::MD_prof, MDNode::get(C, Vals));
 }
+
+void llvm::applyProfMetadataIfEnabled(
+    Value *V, llvm::function_ref<void(Instruction *)> setMetadataCallback) {
+  if (!ProfcheckDisableMetadataFixes) {
+    if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+      setMetadataCallback(Inst);
+    }
+  }
+}
diff --git a/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/llvm/lib/Transforms/Utils/IntegerDivision.cpp
index e95a7a9ae525a..cc271089e3778 100644
--- a/llvm/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/llvm/lib/Transforms/Utils/IntegerDivision.cpp
@@ -16,8 +16,14 @@
 #include "llvm/Transforms/Utils/IntegerDivision.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
 
 using namespace llvm;
 
@@ -235,11 +241,37 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
   Value *Tmp1 = Builder.CreateCall(CTLZ, {Dividend, True});
   Value *SR          = Builder.CreateSub(Tmp0, Tmp1);
   Value *Ret0_4      = Builder.CreateICmpUGT(SR, MSB);
+
+  // Add 'unlikely' branch weights. We mark the case where either the divisor
+  // or the dividend is equal to zero as unlikely.
   Value *Ret0        = Builder.CreateLogicalOr(Ret0_3, Ret0_4);
+  applyProfMetadataIfEnabled(Ret0, [&](Instruction *Inst) {
+    Inst->setMetadata(
+        LLVMContext::MD_prof,
+        MDBuilder(Inst->getContext()).createUnlikelyBranchWeights());
+  });
   Value *RetDividend = Builder.CreateICmpEQ(SR, MSB);
+
+  // Conservatively, we treat the case |divisor| > |dividend| as unknown
   Value *RetVal      = Builder.CreateSelect(Ret0, Zero, Dividend);
+  applyProfMetadataIfEnabled(RetVal, [&](Instruction * Inst){
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Inst, DEBUG_TYPE, F);
+  });
   Value *EarlyRet    = Builder.CreateLogicalOr(Ret0, RetDividend);
-  Builder.CreateCondBr(EarlyRet, End, BB1);
+  applyProfMetadataIfEnabled(EarlyRet, [&](Instruction *Inst) {
+    setExplicitlyUnknownBranchWeightsIfProfiled(*Inst, DEBUG_TYPE, F);
+  });
+
+  // The condition of this branch is based on `EarlyRet`. `EarlyRet` is true
+  // only for special cases like dividend or divisor being zero, or the divisor
+  // being greater than the dividend. Thus, the branch to `End` is unlikely,
+  // and we expect to more frequently enter `BB1`.
+  Value *ConBrSpecialCases = Builder.CreateCondBr(EarlyRet, End, BB1);
+  applyProfMetadataIfEnabled(ConBrSpecialCases, [&](Instruction *Inst){
+    Inst->setMetadata(
+        LLVMContext::MD_prof,
+        MDBuilder(Inst->getContext()).createUnlikelyBranchWeights());
+  });
 
   // ; bb1:                                             ; preds = %special-cases
   // ;   %sr_1     = add i32 %sr, 1
@@ -251,8 +283,17 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
   Value *SR_1     = Builder.CreateAdd(SR, One);
   Value *Tmp2     = Builder.CreateSub(MSB, SR);
   Value *Q        = Builder.CreateShl(Dividend, Tmp2);
+  // We assume that in the common case, the dividend's magnitude is larger than
+  // the divisor's magnitude such that the loop counter (SR) is non-zero.
+  // Specifically, if |dividend| >= 2 * |divisor|, then SR >= 1, ensuring SR_1
+  // >= 2. The case where SR_1 == 0 is thus considered unlikely.
   Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
-  Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+  Value *ConBrBB1 = Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+  applyProfMetadataIfEnabled(ConBrBB1, [&](Instruction *Inst){
+    Inst->setMetadata(
+        LLVMContext::MD_prof,
+        MDBuilder(Inst->getContext()).createUnlikelyBranchWeights());
+  });
 
   // ; preheader:                                           ; preds = %bb1
   // ;   %tmp3 = lshr i32 %dividend, %sr_1
@@ -298,7 +339,15 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
   Value *R     = Builder.CreateSub(Tmp7, Tmp11);
   Value *SR_2  = Builder.CreateAdd(SR_3, NegOne);
   Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
-  Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+  // The loop implements the core bit-by-bit binary long division algorithm.
+  // The branch is unlikely to exit the loop early until it has processed all
+  // significant bits.
+  Value *ConBrDoWhile = Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+  applyProfMetadataIfEnabled(ConBrDoWhile, [&](Instruction *Inst){
+    Inst->setMetadata(
+        LLVMContext::MD_prof,
+        MDBuilder(Inst->getContext()).createUnlikelyBranchWeights());
+  });
 
   // ; loop-exit:                                      ; preds = %do-while, %bb1
   // ;   %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll
index 1d13f723ac224..2ad605bc3ff9e 100644
--- a/llvm/test/CodeGen/RISCV/idiv_large.ll
+++ b/llvm/test/CodeGen/RISCV/idiv_large.ll
@@ -471,7 +471,7 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    sw s9, 100(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s10, 96(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s11, 92(sp) # 4-byte Folded Spill
-; RV32-NEXT:    mv a4, a0
+; RV32-NEXT:    mv a3, a0
 ; RV32-NEXT:    lw ra, 0(a2)
 ; RV32-NEXT:    lw a5, 4(a2)
 ; RV32-NEXT:    lw s9, 8(a2)
@@ -480,8 +480,8 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    addi t4, t4, 1365
 ; RV32-NEXT:    lui t3, 209715
 ; RV32-NEXT:    addi t3, t3, 819
-; RV32-NEXT:    lui a7, 61681
-; RV32-NEXT:    addi a7, a7, -241
+; RV32-NEXT:    lui t2, 61681
+; RV32-NEXT:    addi t2, t2, -241
 ; RV32-NEXT:    bnez a5, .LBB2_2
 ; RV32-NEXT:  # %bb.1: # %_udiv-special-cases
 ; RV32-NEXT:    srli a0, ra, 1
@@ -504,7 +504,7 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    add a0, a6, a0
 ; RV32-NEXT:    srli a6, a0, 4
 ; RV32-NEXT:    add a0, a0, a6
-; RV32-NEXT:    and a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
 ; RV32-NEXT:    slli a6, a0, 8
 ; RV32-NEXT:    add a0, a0, a6
 ; RV32-NEXT:    slli a6, a0, 16
@@ -533,7 +533,7 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    add a0, a6, a0
 ; RV32-NEXT:    srli a6, a0, 4
 ; RV32-NEXT:    add a0, a0, a6
-; RV32-NEXT:    and a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
 ; RV32-NEXT:    slli a6, a0, 8
 ; RV32-NEXT:    add a0, a0, a6
 ; RV32-NEXT:    slli a6, a0, 16
@@ -546,73 +546,73 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:  # %bb.4: # %_udiv-special-cases
 ; RV32-NEXT:    srli a0, s9, 1
 ; RV32-NEXT:    or a0, s9, a0
-; RV32-NEXT:    srli t0, a0, 2
-; RV32-NEXT:    or a0, a0, t0
-; RV32-NEXT:    srli t0, a0, 4
-; RV32-NEXT:    or a0, a0, t0
-; RV32-NEXT:    srli t0, a0, 8
-; RV32-NEXT:    or a0, a0, t0
-; RV32-NEXT:    srli t0, a0, 16
-; RV32-NEXT:    or a0, a0, t0
+; RV32-NEXT:    srli a7, a0, 2
+; RV32-NEXT:    or a0, a0, a7
+; RV32-NEXT:    srli a7, a0, 4
+; RV32-NEXT:    or a0, a0, a7
+; RV32-NEXT:    srli a7, a0, 8
+; RV32-NEXT:    or a0, a0, a7
+; RV32-NEXT:    srli a7, a0, 16
+; RV32-NEXT:    or a0, a0, a7
 ; RV32-NEXT:    not a0, a0
-; RV32-NEXT:    srli t0, a0, 1
-; RV32-NEXT:    and t0, t0, t4
-; RV32-NEXT:    sub a0, a0, t0
-; RV32-NEXT:    and t0, a0, t3
+; RV32-NEXT:    srli a7, a0, 1
+; RV32-NEXT:    and a7, a7, t4
+; RV32-NEXT:    sub a0, a0, a7
+; RV32-NEXT:    and a7, a0, t3
 ; RV32-NEXT:    srli a0, a0, 2
 ; RV32-NEXT:    and a0, a0, t3
-; RV32-NEXT:    add a0, t0, a0
-; RV32-NEXT:    srli t0, a0, 4
-; RV32-NEXT:    add a0, a0, t0
-; RV32-NEXT:    and a0, a0, a7
-; RV32-NEXT:    slli t0, a0, 8
-; RV32-NEXT:    add a0, a0, t0
-; RV32-NEXT:    slli t0, a0, 16
-; RV32-NEXT:    add a0, a0, t0
+; RV32-NEXT:    add a0, a7, a0
+; RV32-NEXT:    srli a7, a0, 4
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
+; RV32-NEXT:    slli a7, a0, 8
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    slli a7, a0, 16
+; RV32-NEXT:    add a0, a0, a7
 ; RV32-NEXT:    srli a0, a0, 24
 ; RV32-NEXT:    addi t5, a0, 32
 ; RV32-NEXT:    j .LBB2_6
 ; RV32-NEXT:  .LBB2_5:
 ; RV32-NEXT:    srli a0, s10, 1
 ; RV32-NEXT:    or a0, s10, a0
-; RV32-NEXT:    srli t0, a0, 2
-; RV32-NEXT:    or a0, a0, t0
-; RV32-NEXT:    srli t0, a0, 4
-; RV32-NEXT:    or a0, a0, t0
-; RV32-NEXT:    srli t0, a0, 8
-; RV32-NEXT:    or a0, a0, t0
-; RV32-NEXT:    srli t0, a0, 16
-; RV32-NEXT:    or a0, a0, t0
+; RV32-NEXT:    srli a7, a0, 2
+; RV32-NEXT:    or a0, a0, a7
+; RV32-NEXT:    srli a7, a0, 4
+; RV32-NEXT:    or a0, a0, a7
+; RV32-NEXT:    srli a7, a0, 8
+; RV32-NEXT:    or a0, a0, a7
+; RV32-NEXT:    srli a7, a0, 16
+; RV32-NEXT:    or a0, a0, a7
 ; RV32-NEXT:    not a0, a0
-; RV32-NEXT:    srli t0, a0, 1
-; RV32-NEXT:    and t0, t0, t4
-; RV32-NEXT:    sub a0, a0, t0
-; RV32-NEXT:    and t0, a0, t3
+; RV32-NEXT:    srli a7, a0, 1
+; RV32-NEXT:    and a7, a7, t4
+; RV32-NEXT:    sub a0, a0, a7
+; RV32-NEXT:    and a7, a0, t3
 ; RV32-NEXT:    srli a0, a0, 2
 ; RV32-NEXT:    and a0, a0, t3
-; RV32-NEXT:    add a0, t0, a0
-; RV32-NEXT:    srli t0, a0, 4
-; RV32-NEXT:    add a0, a0, t0
-; RV32-NEXT:    and a0, a0, a7
-; RV32-NEXT:    slli t0, a0, 8
-; RV32-NEXT:    add a0, a0, t0
-; RV32-NEXT:    slli t0, a0, 16
-; RV32-NEXT:    add a0, a0, t0
+; RV32-NEXT:    add a0, a7, a0
+; RV32-NEXT:    srli a7, a0, 4
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
+; RV32-NEXT:    slli a7, a0, 8
+; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    slli a7, a0, 16
+; RV32-NEXT:    add a0, a0, a7
 ; RV32-NEXT:    srli t5, a0, 24
 ; RV32-NEXT:  .LBB2_6: # %_udiv-special-cases
-; RV32-NEXT:    lw t0, 12(a1)
+; RV32-NEXT:    lw a7, 12(a1)
 ; RV32-NEXT:    addi s0, t6, 64
 ; RV32-NEXT:    bnez s1, .LBB2_8
 ; RV32-NEXT:  # %bb.7: # %_udiv-special-cases
 ; RV32-NEXT:    mv t5, s0
 ; RV32-NEXT:  .LBB2_8: # %_udiv-special-cases
-; RV32-NEXT:    lw t2, 0(a1)
-; RV32-NEXT:    lw t1, 8(a1)
+; RV32-NEXT:    lw t1, 0(a1)
+; RV32-NEXT:    lw t0, 8(a1)
 ; RV32-NEXT:    snez a1, s1
 ; RV32-NEXT:    bnez a6, .LBB2_10
 ; RV32-NEXT:  # %bb.9: # %_udiv-special-cases
-; RV32-NEXT:    srli a0, t2, 1
-; RV32-NEXT:    or a0, t2, a0
+; RV32-NEXT:    srli a0, t1, 1
+; RV32-NEXT:    or a0, t1, a0
 ; RV32-NEXT:    srli s1, a0, 2
 ; RV32-NEXT:    or a0, a0, s1
 ; RV32-NEXT:    srli s1, a0, 4
@@ -631,7 +631,7 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    add a0, s1, a0
 ; RV32-NEXT:    srli s1, a0, 4
 ; RV32-NEXT:    add a0, a0, s1
-; RV32-NEXT:    and a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
 ; RV32-NEXT:    slli s1, a0, 8
 ; RV32-NEXT:    add a0, a0, s1
 ; RV32-NEXT:    slli s1, a0, 16
@@ -660,7 +660,7 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    add a0, s1, a0
 ; RV32-NEXT:    srli s1, a0, 4
 ; RV32-NEXT:    add a0, a0, s1
-; RV32-NEXT:    and a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
 ; RV32-NEXT:    slli s1, a0, 8
 ; RV32-NEXT:    add a0, a0, s1
 ; RV32-NEXT:    slli s1, a0, 16
@@ -669,19 +669,19 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:  .LBB2_11: # %_udiv-special-cases
 ; RV32-NEXT:    or s1, a5, s10
 ; RV32-NEXT:    or s2, ra, s9
-; RV32-NEXT:    or s3, a6, t0
-; RV32-NEXT:    or s4, t2, t1
+; RV32-NEXT:    or s3, a6, a7
+; RV32-NEXT:    or s4, t1, t0
 ; RV32-NEXT:    sltu t6, s0, t6
 ; RV32-NEXT:    addi s0, a1, -1
 ; RV32-NEXT:    addi a1, a0, 64
-; RV32-NEXT:    or s5, t1, t0
+; RV32-NEXT:    or s5, t0, a7
 ; RV32-NEXT:    sltu s6, a1, a0
 ; RV32-NEXT:    snez s7, s5
 ; RV32-NEXT:    addi s7, s7, -1
-; RV32-NEXT:    bnez t0, .LBB2_13
+; RV32-NEXT:    bnez a7, .LBB2_13
 ; RV32-NEXT:  # %bb.12: # %_udiv-special-cases
-; RV32-NEXT:    srli a0, t1, 1
-; RV32-NEXT:    or a0, t1, a0
+; RV32-NEXT:    srli a0, t0, 1
+; RV32-NEXT:    or a0, t0, a0
 ; RV32-NEXT:    srli s8, a0, 2
 ; RV32-NEXT:    or a0, a0, s8
 ; RV32-NEXT:    srli s8, a0, 4
@@ -700,17 +700,17 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    add a0, t4, a0
 ; RV32-NEXT:    srli t3, a0, 4
 ; RV32-NEXT:    add a0, a0, t3
-; RV32-NEXT:    and a0, a0, a7
-; RV32-NEXT:    slli a7, a0, 8
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    slli a7, a0, 16
-; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
+; RV32-NEXT:    slli t2, a0, 8
+; RV32-NEXT:    add a0, a0, t2
+; RV32-NEXT:    slli t2, a0, 16
+; RV32-NEXT:    add a0, a0, t2
 ; RV32-NEXT:    srli a0, a0, 24
 ; RV32-NEXT:    addi a0, a0, 32
 ; RV32-NEXT:    j .LBB2_14
 ; RV32-NEXT:  .LBB2_13:
-; RV32-NEXT:    srli a0, t0, 1
-; RV32-NEXT:    or a0, t0, a0
+; RV32-NEXT:    srli a0, a7, 1
+; RV32-NEXT:    or a0, a7, a0
 ; RV32-NEXT:    srli s8, a0, 2
 ; RV32-NEXT:    or a0, a0, s8
 ; RV32-NEXT:    srli s8, a0, 4
@@ -729,16 +729,16 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    add a0, t4, a0
 ; RV32-NEXT:    srli t3, a0, 4
 ; RV32-NEXT:    add a0, a0, t3
-; RV32-NEXT:    and a0, a0, a7
-; RV32-NEXT:    slli a7, a0, 8
-; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    slli a7, a0, 16
-; RV32-NEXT:    add a0, a0, a7
+; RV32-NEXT:    and a0, a0, t2
+; RV32-NEXT:    slli t2, a0, 8
+; RV32-NEXT:    add a0, a0, t2
+; RV32-NEXT:    slli t2, a0, 16
+; RV32-NEXT:    add a0, a0, t2
 ; RV32-NEXT:    srli a0, a0, 24
 ; RV32-NEXT:  .LBB2_14: # %_udiv-special-cases
 ; RV32-NEXT:    or t4, s2, s1
 ; RV32-NEXT:    or s1, s4, s3
-; RV32-NEXT:    and a7, s0, t6
+; RV32-NEXT:    and t2, s0, t6
 ; RV32-NEXT:    and t3, s7, s6
 ; RV32-NEXT:    bnez s5, .LBB2_16
 ; RV32-NEXT:  # %bb.15: # %_udiv-special-cases
@@ -747,125 +747,92 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    seqz a1, t4
 ; RV32-NEXT:    seqz t4, s1
 ; RV32-NEXT:    sltu t6, t5, a0
-; RV32-NEXT:    sub s1, a7, t3
+; RV32-NEXT:    sub s1, t2, t3
 ; RV32-NEXT:    mv s0, t6
-; RV32-NEXT:    beq a7, t3, .LBB2_18
+; RV32-NEXT:    beq t2, t3, .LBB2_18
 ; RV32-NEXT:  # %bb.17: # %_udiv-special-cases
-; RV32-NEXT:    sltu s0, a7, t3
+; RV32-NEXT:    sltu s0, t2, t3
 ; RV32-NEXT:  .LBB2_18: # %_udiv-special-cases
-; RV32-NEXT:    sub t3, s1, t6
+; RV32-NEXT:    sub t2, s1, t6
 ; RV32-NEXT:    or a1, a1, t4
 ; RV32-NEXT:    neg t6, s0
 ; RV32-NEXT:    seqz s0, s0
 ; RV32-NEXT:    addi s0, s0, -1
-; RV32-NEXT:    or a7, t6, s0
-; RV32-NEXT:    sub t4, t5, a0
-; RV32-NEXT:    beqz a7, .LBB2_20
+; RV32-NEXT:    or t4, t6, s0
+; RV32-NEXT:    sub t3, t5, a0
+; RV32-NEXT:    beqz t4, .LBB2_20
 ; RV32-NEXT:  # %bb.19: # %_udiv-special-cases
-; RV32-NEXT:    snez a0, a7
+; RV32-NEXT:    snez a0, t4
 ; RV32-NEXT:    j .LBB2_21
 ; RV32-NEXT:  .LBB2_20:
-; RV32-NEXT:    snez a0, t3
-; RV32-NEXT:    sltiu a7, t4, 128
-; RV32-NEXT:    xori a7, a7, 1
-; RV32-NEXT:    or a0, a7, a0
+; RV32-NEXT:    snez a0, t2
+; RV32-NEXT:    sltiu t4, t3, 128
+; RV32-NEXT:    xori t4, t4, 1
+; RV32-NEXT:    or a0, t4, a0
 ; RV32-NEXT:  .LBB2_21: # %_udiv-special-cases
 ; RV32-NEXT:    or s1, a1, a0
-; RV32-NEXT:    addi a1, s1, -1
-; RV32-NEXT:    and a7, a1, t0
-; RV32-NEXT:    and t5, a1, t1
-; RV32-NEXT:    and a0, a1, a6
-; RV32-NEXT:    and a1, a1, t2
-; RV32-NEXT:    bnez s1, .LBB2_25
+; RV32-NEXT:    addi t5, s1, -1
+; RV32-NEXT:    and a1, t5, a7
+; RV32-NEXT:    and t4, t5, t0
+; RV32-NEXT:    and a0, t5, a6
+; RV32-NEXT:    and t5, t5, t1
+; RV32-NEXT:    bnez s1, .LBB2_32
 ; RV32-NEXT:  # %bb.22: # %_udiv-special-cases
-; RV32-NEXT:    xori s1, t4, 127
+; RV32-NEXT:    xori s1, t3, 127
 ; RV32-NEXT:    or s1, s1, t6
-; RV32-NEXT:    or s2, t3, s0
+; RV32-NEXT:    or s2, t2, s0
 ; RV32-NEXT:    or s1, s1, s2
-; RV32-NEXT:    beqz s1, .LBB2_25
+; RV32-NEXT:    beqz s1, .LBB2_32
 ; RV32-NEXT:  # %bb.23: # %udiv-bb1
-; RV32-NEXT:    sw a4, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    addi a7, t4, 1
+; RV32-NEXT:    sw a3, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi a1, t3, 1
 ; RV32-NEXT:    sw zero, 56(sp)
 ; RV32-NEXT:    sw zero, 60(sp)
 ; RV32-NEXT:    sw zero, 64(sp)
 ; RV32-NEXT:    sw zero, 68(sp)
-; RV32-NEXT:    sw t2, 72(sp)
+; RV32-NEXT:    sw t1, 72(sp)
 ; RV32-NEXT:    sw a6, 76(sp)
-; RV32-NEXT:    sw t1, 80(sp)
-; RV32-NEXT:    sw t0, 84(sp)
+; RV32-NEXT:    sw t0, 80(sp)
+; RV32-NEXT:    sw a7, 84(sp)
 ; RV32-NEXT:    li a0, 127
-; RV32-NEXT:    addi a2, sp, 72
-; RV32-NEXT:    seqz a4, a7
-; RV32-NEXT:    sub a0, a0, t4
-; RV32-NEXT:    add t3, t3, a4
+; RV32-NEXT:    addi a3, sp, 72
+; RV32-NEXT:    seqz a4, a1
+; RV32-NEXT:    sub a0, a0, t3
+; RV32-NEXT:    add t2, t2, a4
 ; RV32-NEXT:    andi a4, a0, 31
 ; RV32-NEXT:    srli a0, a0, 3
-; RV32-NEXT:    or t5, a7, t3
+; RV32-NEXT:    or t4, a1, t2
 ; RV32-NEXT:    xori a4, a4, 31
 ; RV32-NEXT:    andi a0, a0, 12
-; RV32-NEXT:    seqz t5, t5
-; RV32-NEXT:    sub a2, a2, a0
-; RV32-NEXT:    add t5, t6, t5
-; RV32-NEXT:    lw a0, 0(a2)
-; RV32-NEXT:    lw s1, 4(a2)
-; RV32-NEXT:    lw s3, 8(a2)
-; RV32-NEXT:    lw a2, 12(a2)
-; RV32-NEXT:    sltu t6, t5, t6
-; RV32-NEXT:    or s2, a7, t5
-; RV32-NEXT:    add t6, s0, t6
-; RV32-NEXT:    or s0, t3, t6
-; RV32-NEXT:    or s0, s2, s0
-; RV32-NEXT:    srli s2, s3, 1
-; RV32-NEXT:    srli s4, s1, 1
-; RV32-NEXT:    srli s5, a0, 1
-; RV32-NEXT:    srl s2, s2, a4
-; RV32-NEXT:    srl s4, s4, a4
-; RV32-NEXT:    srl a4, s5, a4
-; RV32-NEXT:    not t4, t4
-; RV32-NEXT:    sll a2, a2, t4
-; RV32-NEXT:    or s2, a2, s2
-; RV32-NEXT:    sll a2, s3, t4
-; RV32-NEXT:    or a2, a2, s4
-; RV32-NEXT:    sll s1, s1, t4
+; RV32-NEXT:    seqz t4, t4
+; RV32-NEXT:    sub a3, a3, a0
+; RV32-NEXT:    add t4, t6, t4
+; RV32-NEXT:    lw a0, 0(a3)
+; RV32-NEXT:    lw s1, 4(a3)
+; RV32-NEXT:    lw s3, 8(a3)
+; RV32-NEXT:    lw a3, 12(a3)
+; RV32-NEXT:    sltu t5, t4, t6
+; RV32-NEXT:    or t6, a1, t4
+; RV32-NEXT:    add t5, s0, t5
+; RV32-NEXT:    or s0, t2, t5
+; RV32-NEXT:    or t6, t6, s0
+; RV32-NEXT:    srli s0, s3, 1
+; RV32-NEXT:    srli s2, s1, 1
+; RV32-NEXT:    srli s4, a0, 1
+; RV32-NEXT:    srl s0, s0, a4
+; RV32-NEXT:    srl s5, s2, a4
+; RV32-NEXT:    srl a4, s4, a4
+; RV32-NEXT:    not t3, t3
+; RV32-NEXT:    sll a3, a3, t3
+; RV32-NEXT:    or s2, a3, s0
+; RV32-NEXT:    sll a3, s3, t3
+; RV32-NEXT:    sll s1, s1, t3
+; RV32-NEXT:    or a3, a3, s5
 ; RV32-NEXT:    or s1, s1, a4
-; RV32-NEXT:    sll t4, a0, t4
-; RV32-NEXT:    li a1, 0
-; RV32-NEXT:    bnez s0, .LBB2_26
-; RV32-NEXT:  .LBB2_24: # %udiv-loop-exit
-; RV32-NEXT:    srli a0, t4, 31
-; RV32-NEXT:    slli a3, s1, 1
-; RV32-NEXT:    srli s1, s1, 31
-; RV32-NEXT:    or a0, a3, a0
-; RV32-NEXT:    slli a3, a2, 1
-; RV32-NEXT:    srli s0, a2, 31
-; RV32-NEXT:    slli s2, s2, 1
-; RV32-NEXT:    slli t4, t4, 1
-; RV32-NEXT:    or t5, a3, s1
-; RV32-NEXT:    or a7, s2, s0
-; RV32-NEXT:    or a1, a1, t4
-; RV32-NEXT:    lw a4, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:  .LBB2_25: # %udiv-end
-; RV32-NEXT:    sw a1, 0(a4)
-; RV32-NEXT:    sw a0, 4(a4)
-; RV32-NEXT:    sw t5, 8(a4)
-; RV32-NEXT:    sw a7, 12(a4)
-; RV32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 144
-; RV32-NEXT:    ret
-; RV32-NEXT:  .LBB2_26: # %udiv-preheader
+; RV32-NEXT:    sll t3, a0, t3
+; RV32-NEXT:    beqz t6, .LBB2_31
+; RV32-NEXT:  # %bb.24: # %udiv-preheader
+; RV32-NEXT:    li t6, 0
 ; RV32-NEXT:    li s3, 0
 ; RV32-NEXT:    li s4, 0
 ; RV32-NEXT:    li s5, 0
@@ -873,118 +840,118 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    sw zero, 44(sp)
 ; RV32-NEXT:    sw zero, 48(sp)
 ; RV32-NEXT:    sw zero, 52(sp)
-; RV32-NEXT:    sw t2, 24(sp)
+; RV32-NEXT:    sw t1, 24(sp)
 ; RV32-NEXT:    sw a6, 28(sp)
-; RV32-NEXT:    sw t1, 32(sp)
-; RV32-NEXT:    sw t0, 36(sp)
-; RV32-NEXT:    srli a0, a7, 3
+; RV32-NEXT:    sw t0, 32(sp)
+; RV32-NEXT:    sw a7, 36(sp)
+; RV32-NEXT:    srli a0, a1, 3
 ; RV32-NEXT:    addi a4, sp, 24
 ; RV32-NEXT:    andi a0, a0, 12
 ; RV32-NEXT:    add a0, a4, a0
 ; RV32-NEXT:    lw a4, 4(a0)
 ; RV32-NEXT:    lw a6, 8(a0)
-; RV32-NEXT:    lw t2, 12(a0)
+; RV32-NEXT:    lw t1, 12(a0)
 ; RV32-NEXT:    lw a0, 0(a0)
-; RV32-NEXT:    andi t0, a7, 31
-; RV32-NEXT:    xori t0, t0, 31
-; RV32-NEXT:    slli t1, t2, 1
+; RV32-NEXT:    andi a7, a1, 31
+; RV32-NEXT:    xori a7, a7, 31
+; RV32-NEXT:    slli t0, t1, 1
 ; RV32-NEXT:    slli s0, a6, 1
 ; RV32-NEXT:    slli s6, a4, 1
-; RV32-NEXT:    sll t1, t1, t0
-; RV32-NEXT:    sll s0, s0, t0
-; RV32-NEXT:    sll s8, s6, t0
-; RV32-NEXT:    seqz t0, ra
-; RV32-NEXT:    srl a6, a6, a7
-; RV32-NEXT:    or s6, a6, t1
-; RV32-NEXT:    or t1, ra, a5
+; RV32-NEXT:    sll t0, t0, a7
+; RV32-NEXT:    sll s0, s0, a7
+; RV32-NEXT:    sll s8, s6, a7
+; RV32-NEXT:    seqz a7, ra
+; RV32-NEXT:    srl a6, a6, a1
+; RV32-NEXT:    or s6, a6, t0
+; RV32-NEXT:    or t0, ra, a5
 ; RV32-NEXT:    sw a5, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sub a6, a5, t0
-; RV32-NEXT:    seqz t1, t1
-; RV32-NEXT:    srl a4, a4, a7
+; RV32-NEXT:    sub a6, a5, a7
+; RV32-NEXT:    seqz t0, t0
+; RV32-NEXT:    srl a4, a4, a1
 ; RV32-NEXT:    or s7, a4, s0
-; RV32-NEXT:    sub t0, s9, t1
-; RV32-NEXT:    mv a3, s9
-; RV32-NEXT:    sltu a4, s9, t1
-; RV32-NEXT:    mv t1, s10
+; RV32-NEXT:    sub a7, s9, t0
+; RV32-NEXT:    mv a2, s9
+; RV32-NEXT:    sltu a4, s9, t0
+; RV32-NEXT:    mv t0, s10
 ; RV32-NEXT:    sub a4, s10, a4
 ; RV32-NEXT:    sw a4, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    srl a0, a0, a7
-; RV32-NEXT:    srl s9, t2, a7
+; RV32-NEXT:    srl a0, a0, a1
+; RV32-NEXT:    srl s9, t1, a1
 ; RV32-NEXT:    or s8, a0, s8
 ; RV32-NEXT:    addi a0, ra, -1
 ; RV32-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    j .LBB2_28
-; RV32-NEXT:  .LBB2_27: # %udiv-do-while
-; RV32-NEXT:    # in Loop: Header=BB2_28 Depth=1
-; RV32-NEXT:    srli s0, a2, 31
+; RV32-NEXT:    j .LBB2_26
+; RV32-NEXT:  .LBB2_25: # %udiv-do-while
+; RV32-NEXT:    # in Loop: Header=BB2_26 Depth=1
+; RV32-NEXT:    srli s0, a3, 31
 ; RV32-NEXT:    slli s2, s2, 1
 ; RV32-NEXT:    sub a0, s11, a0
 ; RV32-NEXT:    srli s11, s1, 31
-; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    slli a3, a3, 1
 ; RV32-NEXT:    or s0, s2, s0
-; RV32-NEXT:    srli s2, t4, 31
+; RV32-NEXT:    srli s2, t3, 31
 ; RV32-NEXT:    slli s1, s1, 1
-; RV32-NEXT:    slli t4, t4, 1
-; RV32-NEXT:    or a2, a2, s11
-; RV32-NEXT:    and s11, s7, t1
+; RV32-NEXT:    slli t3, t3, 1
+; RV32-NEXT:    or a3, a3, s11
+; RV32-NEXT:    and s11, s7, t0
 ; RV32-NEXT:    or s1, s1, s2
-; RV32-NEXT:    and s2, s7, a3
-; RV32-NEXT:    or t4, a1, t4
-; RV32-NEXT:    sub a4, t2, s2
-; RV32-NEXT:    sltu t2, t2, s2
-; RV32-NEXT:    or s2, a7, t3
+; RV32-NEXT:    and s2, s7, a2
+; RV32-NEXT:    or t3, t6, t3
+; RV32-NEXT:    sub a4, t1, s2
+; RV32-NEXT:    sltu t1, t1, s2
+; RV32-NEXT:    or s2, a1, t2
 ; RV32-NEXT:    sub s11, s6, s11
-; RV32-NEXT:    seqz s6, a7
-; RV32-NEXT:    addi a7, a7, -1
-; RV32-NEXT:    andi a1, s7, 1
+; RV32-NEXT:    seqz s6, a1
+; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    andi t6, s7, 1
 ; RV32-NEXT:    sub s7, a0, ra
 ; RV32-NEXT:    seqz a0, s2
-; RV32-NEXT:    sub t3, t3, s6
+; RV32-NEXT:    sub t2, t2, s6
 ; RV32-NEXT:    or s1, s3, s1
-; RV32-NEXT:    or a2, s4, a2
+; RV32-NEXT:    or a3, s4, a3
 ; RV32-NEXT:    or s2, s5, s0
 ; RV32-NEXT:    sub s6, a4, s9
 ; RV32-NEXT:    sltu a4, a4, s9
-; RV32-NEXT:    sub t2, s11, t2
-; RV32-NEXT:    sltu s0, t5, a0
-; RV32-NEXT:    sub t5, t5, a0
-; RV32-NEXT:    sub s9, t2, a4
-; RV32-NEXT:    sub t6, t6, s0
-; RV32-NEXT:    or a0, t3, t6
-; RV32-NEXT:    or a4, a7, t5
+; RV32-NEXT:    sub t1, s11, t1
+; RV32-NEXT:    sltu s0, t4, a0
+; RV32-NEXT:    sub t4, t4, a0
+; RV32-NEXT:    sub s9, t1, a4
+; RV32-NEXT:    sub t5, t5, s0
+; RV32-NEXT:    or a0, t2, t5
+; RV32-NEXT:    or a4, a1, t4
 ; RV32-NEXT:    or a0, a4, a0
 ; RV32-NEXT:    sub s8, s8, s10
 ; RV32-NEXT:    li s3, 0
 ; RV32-NEXT:    li s4, 0
 ; RV32-NEXT:    li s5, 0
 ; RV32-NEXT:    mv ra, a5
-; RV32-NEXT:    beqz a0, .LBB2_24
-; RV32-NEXT:  .LBB2_28: # %udiv-do-while
+; RV32-NEXT:    beqz a0, .LBB2_31
+; RV32-NEXT:  .LBB2_26: # %udiv-do-while
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    srli a0, s8, 31
-; RV32-NEXT:    slli t2, s7, 1
+; RV32-NEXT:    slli t1, s7, 1
 ; RV32-NEXT:    slli s8, s8, 1
-; RV32-NEXT:    or s11, t2, a0
+; RV32-NEXT:    or s11, t1, a0
 ; RV32-NEXT:    srli a0, s2, 31
 ; RV32-NEXT:    or s8, s8, a0
-; RV32-NEXT:    beq a6, s11, .LBB2_30
-; RV32-NEXT:  # %bb.29: # %udiv-do-while
-; RV32-NEXT:    # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT:    beq a6, s11, .LBB2_28
+; RV32-NEXT:  # %bb.27: # %udiv-do-while
+; RV32-NEXT:    # in Loop: Header=BB2_26 Depth=1
 ; RV32-NEXT:    sltu a0, a6, s11
-; RV32-NEXT:    j .LBB2_31
-; RV32-NEXT:  .LBB2_30: # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT:    j .LBB2_29
+; RV32-NEXT:  .LBB2_28: # in Loop: Header=BB2_26 Depth=1
 ; RV32-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    sltu a0, a0, s8
-; RV32-NEXT:  .LBB2_31: # %udiv-do-while
-; RV32-NEXT:    # in Loop: Header=BB2_28 Depth=1
-; RV32-NEXT:    srli t2, s6, 31
+; RV32-NEXT:  .LBB2_29: # %udiv-do-while
+; RV32-NEXT:    # in Loop: Header=BB2_26 Depth=1
+; RV32-NEXT:    srli t1, s6, 31
 ; RV32-NEXT:    slli s9, s9, 1
 ; RV32-NEXT:    srli s7, s7, 31
 ; RV32-NEXT:    slli s10, s6, 1
-; RV32-NEXT:    or s6, s9, t2
-; RV32-NEXT:    or t2, s10, s7
-; RV32-NEXT:    sub s7, t0, t2
-; RV32-NEXT:    sltu s9, t0, t2
+; RV32-NEXT:    or s6, s9, t1
+; RV32-NEXT:    or t1, s10, s7
+; RV32-NEXT:    sub s7, a7, t1
+; RV32-NEXT:    sltu s9, a7, t1
 ; RV32-NEXT:    lw a4, 16(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    sub s10, a4, s6
 ; RV32-NEXT:    sltu a0, s7, a0
@@ -997,11 +964,44 @@ define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
 ; RV32-NEXT:    and a0, s7, a0
 ; RV32-NEXT:    sltu ra, s8, s10
 ; RV32-NEXT:    mv s9, ra
-; RV32-NEXT:    beq s11, a0, .LBB2_27
-; RV32-NEXT:  # %bb.32: # %udiv-do-while
-; RV32-NEXT:    # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT:    beq s11, a0, .LBB2_25
+; RV32-NEXT:  # %bb.30: # %udiv-do-while
+; RV32-NEXT:    # in Loop: Header=BB2_26 Depth=1
 ; RV32-NEXT:    sltu s9, s11, a0
-; RV32-NEXT:    j .LBB2_27
+; RV32-NEXT:    j .LBB2_25
+; RV32-NEXT:  .LBB2_31: # %udiv-loop-exit
+; RV32-NEXT:    srli a0, t3, 31
+; RV32-NEXT:    slli a1, s1, 1
+; RV32-NEXT:    srli s1, s1, 31
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    slli a1, a3, 1
+; RV32-NEXT:    srli s0, a3, 31
+; RV32-NEXT:    slli s2, s2, 1
+; RV32-NEXT:    slli t3, t3, 1
+; RV32-NEXT:    or t4, a1, s1
+; RV32-NEXT:    or a1, s2, s0
+; RV32-NEXT:    or t5, t6, t3
+; RV32-NEXT:    lw a3, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:  .LBB2_32: # %udiv-end
+; RV32-NEXT:    sw t5, 0(a3)
+; RV32-NEXT:    sw a0, 4(a3)
+; RV32-NEXT:    sw t4, 8(a3)
+; RV32-NEXT:    sw a1, 12(a3)
+; RV32-NEXT:    lw ra, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 132(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 128(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 104(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 100(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 144
+; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: udiv_i128:
 ; RV64:       # %bb.0:
@@ -1038,20 +1038,20 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    lw a5, 4(a2)
 ; RV32-NEXT:    lw a6, 8(a2)
 ; RV32-NEXT:    lw a0, 12(a2)
-; RV32-NEXT:    sw a0, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a0, 24(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    lui a0, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi t5, a0, 1365
 ; RV32-NEXT:    addi t4, a2, 819
 ; RV32-NEXT:    addi t3, a3, -241
-; RV32-NEXT:    sw a6, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a6, 28(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    slli a0, a6, 31
 ; RV32-NEXT:    srli a2, a5, 1
-; RV32-NEXT:    sw a5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a5, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    slli a3, a5, 31
 ; RV32-NEXT:    or a6, a2, a0
-; RV32-NEXT:    sw a4, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw a4, 32(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    srli a0, a4, 1
 ; RV32-NEXT:    or a7, a0, a3
 ; RV32-NEXT:    bnez a6, .LBB3_2
@@ -1082,7 +1082,7 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    slli a2, a0, 16
 ; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    srli a0, a0, 24
-; RV32-NEXT:    addi a4, a0, 32
+; RV32-NEXT:    addi a5, a0, 32
 ; RV32-NEXT:    j .LBB3_3
 ; RV32-NEXT:  .LBB3_2:
 ; RV32-NEXT:    srli a0, a6, 1
@@ -1110,20 +1110,20 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    slli a2, a0, 16
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    srli a4, a0, 24
+; RV32-NEXT:    srli a5, a0, 24
 ; RV32-NEXT:  .LBB3_3: # %_udiv-special-cases
-; RV32-NEXT:    lw a5, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    srli a0, a5, 1
+; RV32-NEXT:    lw a4, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    srli a0, a4, 1
 ; RV32-NEXT:    slli a3, t2, 31
-; RV32-NEXT:    slli a5, a5, 31
-; RV32-NEXT:    lw a2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    slli a4, a4, 31
+; RV32-NEXT:    lw a2, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    srli t0, a2, 1
-; RV32-NEXT:    lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a2, 32(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    slli a2, a2, 31
 ; RV32-NEXT:    li s2, 64
 ; RV32-NEXT:    bnez a2, .LBB3_5
 ; RV32-NEXT:  # %bb.4: # %_udiv-special-cases
-; RV32-NEXT:    li t1, 64
+; RV32-NEXT:    li t6, 64
 ; RV32-NEXT:    j .LBB3_6
 ; RV32-NEXT:  .LBB3_5:
 ; RV32-NEXT:    srli t1, a2, 1
@@ -1151,91 +1151,91 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    add t1, t1, t6
 ; RV32-NEXT:    slli t6, t1, 16
 ; RV32-NEXT:    add t1, t1, t6
-; RV32-NEXT:    srli t1, t1, 24
+; RV32-NEXT:    srli t6, t1, 24
 ; RV32-NEXT:  .LBB3_6: # %_udiv-special-cases
-; RV32-NEXT:    or a3, a3, a0
-; RV32-NEXT:    or a5, t0, a5
+; RV32-NEXT:    or t1, a3, a0
+; RV32-NEXT:    or t0, t0, a4
 ; RV32-NEXT:    bnez a2, .LBB3_8
 ; RV32-NEXT:  # %bb.7: # %_udiv-special-cases
-; RV32-NEXT:    li t1, 128
+; RV32-NEXT:    li t6, 128
 ; RV32-NEXT:  .LBB3_8: # %_udiv-special-cases
-; RV32-NEXT:    or t0, a5, a3
-; RV32-NEXT:    addi a2, a4, 64
-; RV32-NEXT:    addi a0, t1, 128
-; RV32-NEXT:    or a6, a6, a3
-; RV32-NEXT:    or a7, a7, a5
-; RV32-NEXT:    or s3, a7, a6
-; RV32-NEXT:    sltu s0, a0, t1
+; RV32-NEXT:    or a4, t0, t1
+; RV32-NEXT:    addi a3, a5, 64
+; RV32-NEXT:    addi a0, t6, 128
+; RV32-NEXT:    or a2, a6, t1
+; RV32-NEXT:    or a6, a7, t0
+; RV32-NEXT:    or s3, a6, a2
+; RV32-NEXT:    sltu s0, a0, t6
 ; RV32-NEXT:    bnez s3, .LBB3_11
 ; RV32-NEXT:  # %bb.9: # %_udiv-special-cases
 ; RV32-NEXT:    mv t6, s0
-; RV32-NEXT:    beqz a3, .LBB3_12
+; RV32-NEXT:    beqz t1, .LBB3_12
 ; RV32-NEXT:  .LBB3_10:
-; RV32-NEXT:    srli a4, a3, 1
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 2
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 8
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 16
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    not a3, a3
-; RV32-NEXT:    srli a4, a3, 1
-; RV32-NEXT:    and a4, a4, t5
-; RV32-NEXT:    sub a3, a3, a4
-; RV32-NEXT:    and a4, a3, t4
-; RV32-NEXT:    srli a3, a3, 2
-; RV32-NEXT:    and a3, a3, t4
-; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    and a3, a3, t3
-; RV32-NEXT:    slli a4, a3, 8
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    slli a4, a3, 16
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    srli s1, a3, 24
-; RV32-NEXT:    beqz t0, .LBB3_13
+; RV32-NEXT:    srli a2, t1, 1
+; RV32-NEXT:    or a2, t1, a2
+; RV32-NEXT:    srli a5, a2, 2
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    srli a5, a2, 4
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    srli a5, a2, 8
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    srli a5, a2, 16
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    not a2, a2
+; RV32-NEXT:    srli a5, a2, 1
+; RV32-NEXT:    and a5, a5, t5
+; RV32-NEXT:    sub a2, a2, a5
+; RV32-NEXT:    and a5, a2, t4
+; RV32-NEXT:    srli a2, a2, 2
+; RV32-NEXT:    and a2, a2, t4
+; RV32-NEXT:    add a2, a5, a2
+; RV32-NEXT:    srli a5, a2, 4
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    and a2, a2, t3
+; RV32-NEXT:    slli a5, a2, 8
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    slli a5, a2, 16
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    srli s1, a2, 24
+; RV32-NEXT:    beqz a4, .LBB3_13
 ; RV32-NEXT:    j .LBB3_14
 ; RV32-NEXT:  .LBB3_11:
-; RV32-NEXT:    snez a6, t0
-; RV32-NEXT:    sltu a4, a2, a4
-; RV32-NEXT:    addi a6, a6, -1
-; RV32-NEXT:    and t6, a6, a4
-; RV32-NEXT:    bnez a3, .LBB3_10
+; RV32-NEXT:    snez a2, a4
+; RV32-NEXT:    sltu a5, a3, a5
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and t6, a2, a5
+; RV32-NEXT:    bnez t1, .LBB3_10
 ; RV32-NEXT:  .LBB3_12: # %_udiv-special-cases
-; RV32-NEXT:    srli a3, a5, 1
-; RV32-NEXT:    or a3, a5, a3
-; RV32-NEXT:    srli a4, a3, 2
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 8
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 16
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    not a3, a3
-; RV32-NEXT:    srli a4, a3, 1
-; RV32-NEXT:    and a4, a4, t5
-; RV32-NEXT:    sub a3, a3, a4
-; RV32-NEXT:    and a4, a3, t4
-; RV32-NEXT:    srli a3, a3, 2
-; RV32-NEXT:    and a3, a3, t4
-; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    and a3, a3, t3
-; RV32-NEXT:    slli a4, a3, 8
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    slli a4, a3, 16
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    srli a3, a3, 24
-; RV32-NEXT:    addi s1, a3, 32
-; RV32-NEXT:    bnez t0, .LBB3_14
+; RV32-NEXT:    srli a2, t0, 1
+; RV32-NEXT:    or a2, t0, a2
+; RV32-NEXT:    srli a5, a2, 2
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    srli a5, a2, 4
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    srli a5, a2, 8
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    srli a5, a2, 16
+; RV32-NEXT:    or a2, a2, a5
+; RV32-NEXT:    not a2, a2
+; RV32-NEXT:    srli a5, a2, 1
+; RV32-NEXT:    and a5, a5, t5
+; RV32-NEXT:    sub a2, a2, a5
+; RV32-NEXT:    and a5, a2, t4
+; RV32-NEXT:    srli a2, a2, 2
+; RV32-NEXT:    and a2, a2, t4
+; RV32-NEXT:    add a2, a5, a2
+; RV32-NEXT:    srli a5, a2, 4
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    and a2, a2, t3
+; RV32-NEXT:    slli a5, a2, 8
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    slli a5, a2, 16
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    srli a2, a2, 24
+; RV32-NEXT:    addi s1, a2, 32
+; RV32-NEXT:    bnez a4, .LBB3_14
 ; RV32-NEXT:  .LBB3_13: # %_udiv-special-cases
-; RV32-NEXT:    mv s1, a2
+; RV32-NEXT:    mv s1, a3
 ; RV32-NEXT:  .LBB3_14: # %_udiv-special-cases
 ; RV32-NEXT:    lw a7, 0(a1)
 ; RV32-NEXT:    lw t0, 4(a1)
@@ -1344,179 +1344,179 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    add s2, s2, s7
 ; RV32-NEXT:    srli s2, s2, 24
 ; RV32-NEXT:  .LBB3_21: # %_udiv-special-cases
-; RV32-NEXT:    or s7, a2, a0
-; RV32-NEXT:    or a3, s6, a3
+; RV32-NEXT:    or a2, a2, a0
+; RV32-NEXT:    or s7, s6, a3
 ; RV32-NEXT:    bnez a5, .LBB3_23
 ; RV32-NEXT:  # %bb.22: # %_udiv-special-cases
 ; RV32-NEXT:    li s2, 128
 ; RV32-NEXT:  .LBB3_23: # %_udiv-special-cases
-; RV32-NEXT:    or a2, a3, s7
+; RV32-NEXT:    or a3, s7, a2
 ; RV32-NEXT:    addi a0, a4, 64
 ; RV32-NEXT:    addi s6, s2, 128
-; RV32-NEXT:    or a5, s4, s7
-; RV32-NEXT:    or s4, s5, a3
+; RV32-NEXT:    or a5, s4, a2
+; RV32-NEXT:    or s4, s5, s7
 ; RV32-NEXT:    or s5, s4, a5
 ; RV32-NEXT:    sltu s4, s6, s2
 ; RV32-NEXT:    bnez s5, .LBB3_26
 ; RV32-NEXT:  # %bb.24: # %_udiv-special-cases
 ; RV32-NEXT:    mv s2, s4
-; RV32-NEXT:    snez s3, s3
-; RV32-NEXT:    beqz s7, .LBB3_27
+; RV32-NEXT:    snez a5, s3
+; RV32-NEXT:    beqz a2, .LBB3_27
 ; RV32-NEXT:  .LBB3_25:
-; RV32-NEXT:    srli a3, s7, 1
-; RV32-NEXT:    or a3, s7, a3
-; RV32-NEXT:    srli a4, a3, 2
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 8
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 16
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    not a3, a3
-; RV32-NEXT:    srli a4, a3, 1
+; RV32-NEXT:    srli a4, a2, 1
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 2
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 4
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 8
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 16
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    not a2, a2
+; RV32-NEXT:    srli a4, a2, 1
 ; RV32-NEXT:    and a4, a4, t5
-; RV32-NEXT:    sub a3, a3, a4
-; RV32-NEXT:    and a4, a3, t4
-; RV32-NEXT:    srli a3, a3, 2
-; RV32-NEXT:    and a3, a3, t4
-; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    and a3, a3, t3
-; RV32-NEXT:    slli a4, a3, 8
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    slli a4, a3, 16
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 24
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    and a4, a2, t4
+; RV32-NEXT:    srli a2, a2, 2
+; RV32-NEXT:    and a2, a2, t4
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    srli a4, a2, 4
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    and a2, a2, t3
+; RV32-NEXT:    slli a4, a2, 8
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    slli a4, a2, 16
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 24
 ; RV32-NEXT:    j .LBB3_28
 ; RV32-NEXT:  .LBB3_26:
-; RV32-NEXT:    snez a5, a2
+; RV32-NEXT:    snez a5, a3
 ; RV32-NEXT:    sltu a4, a0, a4
 ; RV32-NEXT:    addi a5, a5, -1
 ; RV32-NEXT:    and s2, a5, a4
-; RV32-NEXT:    snez s3, s3
-; RV32-NEXT:    bnez s7, .LBB3_25
+; RV32-NEXT:    snez a5, s3
+; RV32-NEXT:    bnez a2, .LBB3_25
 ; RV32-NEXT:  .LBB3_27: # %_udiv-special-cases
-; RV32-NEXT:    srli a4, a3, 1
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 2
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 8
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    srli a4, a3, 16
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    not a3, a3
-; RV32-NEXT:    srli a4, a3, 1
+; RV32-NEXT:    srli a2, s7, 1
+; RV32-NEXT:    or a2, s7, a2
+; RV32-NEXT:    srli a4, a2, 2
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 4
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 8
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    srli a4, a2, 16
+; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    not a2, a2
+; RV32-NEXT:    srli a4, a2, 1
 ; RV32-NEXT:    and a4, a4, t5
-; RV32-NEXT:    sub a3, a3, a4
-; RV32-NEXT:    and a4, a3, t4
-; RV32-NEXT:    srli a3, a3, 2
-; RV32-NEXT:    and a3, a3, t4
-; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    srli a4, a3, 4
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    and a3, a3, t3
-; RV32-NEXT:    slli a4, a3, 8
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    slli a4, a3, 16
-; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    srli a3, a3, 24
-; RV32-NEXT:    addi a4, a3, 32
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    and a4, a2, t4
+; RV32-NEXT:    srli a2, a2, 2
+; RV32-NEXT:    and a2, a2, t4
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    srli a4, a2, 4
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    and a2, a2, t3
+; RV32-NEXT:    slli a4, a2, 8
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    slli a4, a2, 16
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    srli a2, a2, 24
+; RV32-NEXT:    addi a4, a2, 32
 ; RV32-NEXT:  .LBB3_28: # %_udiv-special-cases
-; RV32-NEXT:    xori a3, s0, 1
-; RV32-NEXT:    addi s3, s3, -1
-; RV32-NEXT:    bnez a2, .LBB3_30
+; RV32-NEXT:    xori a2, s0, 1
+; RV32-NEXT:    addi a5, a5, -1
+; RV32-NEXT:    bnez a3, .LBB3_30
 ; RV32-NEXT:  # %bb.29: # %_udiv-special-cases
 ; RV32-NEXT:    mv a4, a0
 ; RV32-NEXT:  .LBB3_30: # %_udiv-special-cases
 ; RV32-NEXT:    andi s11, a1, 1
 ; RV32-NEXT:    andi a0, t2, 1
-; RV32-NEXT:    lw a1, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw a2, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    or s9, a1, a2
-; RV32-NEXT:    or a5, a7, a6
-; RV32-NEXT:    neg a1, a3
-; RV32-NEXT:    and t2, s3, s0
+; RV32-NEXT:    lw a1, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a3, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    or s9, a1, a3
+; RV32-NEXT:    or a3, a7, a6
+; RV32-NEXT:    neg a1, a2
+; RV32-NEXT:    and s0, a5, s0
 ; RV32-NEXT:    bnez s5, .LBB3_32
 ; RV32-NEXT:  # %bb.31: # %_udiv-special-cases
 ; RV32-NEXT:    mv a4, s6
 ; RV32-NEXT:  .LBB3_32: # %_udiv-special-cases
-; RV32-NEXT:    lw a2, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw a3, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    or s10, a2, a3
+; RV32-NEXT:    lw a2, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    or s10, a2, a5
 ; RV32-NEXT:    or a2, s9, a0
-; RV32-NEXT:    or a3, t0, t1
-; RV32-NEXT:    or t4, a5, s11
-; RV32-NEXT:    and a1, t2, a1
-; RV32-NEXT:    xori a5, s4, 1
+; RV32-NEXT:    or a5, t0, t1
+; RV32-NEXT:    or t4, a3, s11
+; RV32-NEXT:    and a1, s0, a1
+; RV32-NEXT:    xori a3, s4, 1
 ; RV32-NEXT:    snez t2, s5
-; RV32-NEXT:    neg a5, a5
+; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    addi t2, t2, -1
 ; RV32-NEXT:    and t3, t2, s4
 ; RV32-NEXT:    sltu t2, s1, a4
-; RV32-NEXT:    and t3, t3, a5
-; RV32-NEXT:    mv a5, t2
+; RV32-NEXT:    and t3, t3, a3
+; RV32-NEXT:    mv a3, t2
 ; RV32-NEXT:    beq t6, s2, .LBB3_34
 ; RV32-NEXT:  # %bb.33: # %_udiv-special-cases
-; RV32-NEXT:    sltu a5, t6, s2
+; RV32-NEXT:    sltu a3, t6, s2
 ; RV32-NEXT:  .LBB3_34: # %_udiv-special-cases
 ; RV32-NEXT:    or a2, a2, s10
-; RV32-NEXT:    or a3, t4, a3
+; RV32-NEXT:    or a5, t4, a5
 ; RV32-NEXT:    sltu t5, a1, t3
-; RV32-NEXT:    mv t4, a5
+; RV32-NEXT:    mv t4, a3
 ; RV32-NEXT:    beq a1, t3, .LBB3_36
 ; RV32-NEXT:  # %bb.35: # %_udiv-special-cases
 ; RV32-NEXT:    mv t4, t5
 ; RV32-NEXT:  .LBB3_36: # %_udiv-special-cases
 ; RV32-NEXT:    seqz a2, a2
-; RV32-NEXT:    seqz a3, a3
+; RV32-NEXT:    seqz a5, a5
 ; RV32-NEXT:    andi t4, t4, 1
 ; RV32-NEXT:    sub t6, t6, s2
 ; RV32-NEXT:    sub a1, a1, t3
 ; RV32-NEXT:    sub t2, t6, t2
-; RV32-NEXT:    sltu t3, a1, a5
+; RV32-NEXT:    sltu t3, a1, a3
 ; RV32-NEXT:    add t3, t5, t3
 ; RV32-NEXT:    neg t3, t3
-; RV32-NEXT:    sub t5, a1, a5
+; RV32-NEXT:    sub t5, a1, a3
 ; RV32-NEXT:    or a1, t5, t3
-; RV32-NEXT:    sub t6, s1, a4
+; RV32-NEXT:    sub a3, s1, a4
 ; RV32-NEXT:    beqz a1, .LBB3_38
 ; RV32-NEXT:  # %bb.37: # %_udiv-special-cases
 ; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    or a2, a2, a5
 ; RV32-NEXT:    bnez t4, .LBB3_39
 ; RV32-NEXT:    j .LBB3_40
 ; RV32-NEXT:  .LBB3_38:
 ; RV32-NEXT:    snez a1, t2
-; RV32-NEXT:    sltiu a4, t6, 129
+; RV32-NEXT:    sltiu a4, a3, 129
 ; RV32-NEXT:    xori a4, a4, 1
 ; RV32-NEXT:    or a1, a4, a1
-; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    or a2, a2, a5
 ; RV32-NEXT:    beqz t4, .LBB3_40
 ; RV32-NEXT:  .LBB3_39: # %_udiv-special-cases
 ; RV32-NEXT:    mv a1, t4
 ; RV32-NEXT:  .LBB3_40: # %_udiv-special-cases
-; RV32-NEXT:    or a5, a2, a1
-; RV32-NEXT:    addi a4, a5, -1
-; RV32-NEXT:    and s0, s11, a4
-; RV32-NEXT:    and a3, a4, t1
-; RV32-NEXT:    and a2, a4, a6
-; RV32-NEXT:    and a1, a4, t0
-; RV32-NEXT:    and a4, a4, a7
-; RV32-NEXT:    bnez a5, .LBB3_57
+; RV32-NEXT:    or t6, a2, a1
+; RV32-NEXT:    addi a5, t6, -1
+; RV32-NEXT:    and s0, s11, a5
+; RV32-NEXT:    and a4, a5, t1
+; RV32-NEXT:    and a2, a5, a6
+; RV32-NEXT:    and a1, a5, t0
+; RV32-NEXT:    and a5, a5, a7
+; RV32-NEXT:    bnez t6, .LBB3_57
 ; RV32-NEXT:  # %bb.41: # %_udiv-special-cases
-; RV32-NEXT:    or a5, t2, t3
-; RV32-NEXT:    xori s1, t6, 128
+; RV32-NEXT:    or t6, t2, t3
+; RV32-NEXT:    xori s1, a3, 128
 ; RV32-NEXT:    or s1, s1, t4
 ; RV32-NEXT:    or s1, s1, t5
-; RV32-NEXT:    or a5, s1, a5
-; RV32-NEXT:    beqz a5, .LBB3_57
+; RV32-NEXT:    or t6, s1, t6
+; RV32-NEXT:    beqz t6, .LBB3_57
 ; RV32-NEXT:  # %bb.42: # %udiv-bb1
 ; RV32-NEXT:    sw s8, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    addi a1, t6, 1
+; RV32-NEXT:    addi a1, a3, 1
 ; RV32-NEXT:    sw zero, 136(sp)
 ; RV32-NEXT:    sw zero, 140(sp)
 ; RV32-NEXT:    sw zero, 144(sp)
@@ -1531,23 +1531,23 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    sw t1, 164(sp)
 ; RV32-NEXT:    sw s11, 168(sp)
 ; RV32-NEXT:    li a2, 128
-; RV32-NEXT:    addi a3, sp, 152
-; RV32-NEXT:    neg ra, t6
-; RV32-NEXT:    seqz a4, a1
-; RV32-NEXT:    sub a2, a2, t6
-; RV32-NEXT:    add t2, t2, a4
-; RV32-NEXT:    andi a4, a2, 31
+; RV32-NEXT:    addi a4, sp, 152
+; RV32-NEXT:    neg ra, a3
+; RV32-NEXT:    seqz a5, a1
+; RV32-NEXT:    sub a2, a2, a3
+; RV32-NEXT:    add t2, t2, a5
+; RV32-NEXT:    andi a3, a2, 31
 ; RV32-NEXT:    srli a2, a2, 3
 ; RV32-NEXT:    or a5, a1, t2
-; RV32-NEXT:    xori s8, a4, 31
+; RV32-NEXT:    xori s8, a3, 31
 ; RV32-NEXT:    andi a2, a2, 28
 ; RV32-NEXT:    seqz t6, a5
-; RV32-NEXT:    sub a2, a3, a2
+; RV32-NEXT:    sub a3, a4, a2
 ; RV32-NEXT:    add t6, t5, t6
-; RV32-NEXT:    lw a3, 0(a2)
-; RV32-NEXT:    lw a5, 4(a2)
-; RV32-NEXT:    lw s1, 8(a2)
-; RV32-NEXT:    lw a4, 12(a2)
+; RV32-NEXT:    lw a2, 0(a3)
+; RV32-NEXT:    lw a5, 4(a3)
+; RV32-NEXT:    lw s1, 8(a3)
+; RV32-NEXT:    lw a4, 12(a3)
 ; RV32-NEXT:    sltu t5, t6, t5
 ; RV32-NEXT:    or s0, a1, t6
 ; RV32-NEXT:    add t3, t3, t5
@@ -1563,22 +1563,22 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    sll s1, s1, ra
 ; RV32-NEXT:    srl s2, s2, s8
 ; RV32-NEXT:    or s2, s1, s2
-; RV32-NEXT:    srli s1, a3, 1
+; RV32-NEXT:    srli s1, a2, 1
 ; RV32-NEXT:    sll a5, a5, ra
 ; RV32-NEXT:    srl s3, s1, s8
 ; RV32-NEXT:    andi s1, t4, 1
 ; RV32-NEXT:    or s3, a5, s3
 ; RV32-NEXT:    or a5, t5, s1
-; RV32-NEXT:    sll t5, a3, ra
+; RV32-NEXT:    sll t5, a2, ra
 ; RV32-NEXT:    beqz a5, .LBB3_55
 ; RV32-NEXT:  # %bb.43: # %udiv-preheader
 ; RV32-NEXT:    sw zero, 52(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw zero, 48(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw zero, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li s6, 0
+; RV32-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    li s7, 0
 ; RV32-NEXT:    srli a4, a4, 1
-; RV32-NEXT:    lw a2, 16(a2)
+; RV32-NEXT:    lw a3, 16(a3)
 ; RV32-NEXT:    sw zero, 104(sp)
 ; RV32-NEXT:    sw zero, 108(sp)
 ; RV32-NEXT:    sw zero, 112(sp)
@@ -1595,82 +1595,81 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    sw t0, 60(sp)
 ; RV32-NEXT:    sw a6, 64(sp)
 ; RV32-NEXT:    sw t1, 68(sp)
-; RV32-NEXT:    srli a3, a1, 3
+; RV32-NEXT:    srli a2, a1, 3
 ; RV32-NEXT:    addi a5, sp, 56
 ; RV32-NEXT:    andi a6, a1, 31
 ; RV32-NEXT:    or a7, s9, s10
 ; RV32-NEXT:    srl a4, a4, s8
-; RV32-NEXT:    andi a3, a3, 28
+; RV32-NEXT:    andi a2, a2, 28
 ; RV32-NEXT:    xori a6, a6, 31
 ; RV32-NEXT:    snez a7, a7
-; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    add a2, a5, a2
 ; RV32-NEXT:    add a0, a0, a7
-; RV32-NEXT:    lw a5, 16(a3)
-; RV32-NEXT:    lw a7, 0(a3)
-; RV32-NEXT:    lw t0, 4(a3)
-; RV32-NEXT:    lw t1, 8(a3)
-; RV32-NEXT:    lw a3, 12(a3)
-; RV32-NEXT:    sll a2, a2, ra
-; RV32-NEXT:    or a2, a2, a4
+; RV32-NEXT:    lw a5, 16(a2)
+; RV32-NEXT:    lw a7, 0(a2)
+; RV32-NEXT:    lw t0, 4(a2)
+; RV32-NEXT:    lw t1, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    sll a3, a3, ra
+; RV32-NEXT:    or a3, a3, a4
 ; RV32-NEXT:    slli a5, a5, 1
-; RV32-NEXT:    slli a4, a3, 1
+; RV32-NEXT:    slli a4, a2, 1
 ; RV32-NEXT:    slli t4, t1, 1
 ; RV32-NEXT:    slli s4, t0, 1
 ; RV32-NEXT:    sll a5, a5, a6
 ; RV32-NEXT:    sll a4, a4, a6
 ; RV32-NEXT:    sll t4, t4, a6
 ; RV32-NEXT:    sll a6, s4, a6
-; RV32-NEXT:    srl a3, a3, a1
-; RV32-NEXT:    or s9, a3, a5
-; RV32-NEXT:    lw s4, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    seqz a3, s4
+; RV32-NEXT:    srl a2, a2, a1
+; RV32-NEXT:    or s9, a2, a5
+; RV32-NEXT:    lw s4, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    seqz a2, s4
 ; RV32-NEXT:    srl a5, t1, a1
 ; RV32-NEXT:    or ra, a5, a4
-; RV32-NEXT:    lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a5, 20(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    or a4, s4, a5
-; RV32-NEXT:    sub a5, a5, a3
-; RV32-NEXT:    seqz a3, a4
+; RV32-NEXT:    sub s5, a5, a2
+; RV32-NEXT:    seqz a2, a4
 ; RV32-NEXT:    srl a4, t0, a1
 ; RV32-NEXT:    or s11, a4, t4
-; RV32-NEXT:    lw a4, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    sub t0, a4, a3
-; RV32-NEXT:    sw t0, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sltu a3, a4, a3
-; RV32-NEXT:    addi a0, a0, 1
 ; RV32-NEXT:    lw a4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    sub s5, a4, a3
+; RV32-NEXT:    sub a5, a4, a2
+; RV32-NEXT:    sw a5, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sltu a2, a4, a2
+; RV32-NEXT:    addi a0, a0, 1
+; RV32-NEXT:    lw a4, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sub s6, a4, a2
 ; RV32-NEXT:    andi a0, a0, 1
 ; RV32-NEXT:    sw a0, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    andi a0, a2, 1
+; RV32-NEXT:    andi a0, a3, 1
 ; RV32-NEXT:    srl a2, a7, a1
-; RV32-NEXT:    or s8, a2, a6
+; RV32-NEXT:    or a3, a2, a6
 ; RV32-NEXT:    addi s4, s4, -1
 ; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw a5, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    j .LBB3_45
 ; RV32-NEXT:  .LBB3_44: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    lw a2, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    and t1, a0, a2
-; RV32-NEXT:    xor a2, a6, a3
+; RV32-NEXT:    lw a6, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    and t1, a0, a6
+; RV32-NEXT:    xor a6, a4, s8
 ; RV32-NEXT:    xor a7, ra, t1
-; RV32-NEXT:    or a2, a7, a2
-; RV32-NEXT:    srli a2, s2, 31
+; RV32-NEXT:    or a6, a7, a6
+; RV32-NEXT:    srli a6, s2, 31
 ; RV32-NEXT:    sltu a7, ra, t1
 ; RV32-NEXT:    sub t1, ra, t1
 ; RV32-NEXT:    slli ra, s0, 1
-; RV32-NEXT:    sub a3, a6, a3
-; RV32-NEXT:    srli a6, s3, 31
+; RV32-NEXT:    sub a4, a4, s8
+; RV32-NEXT:    srli s8, s3, 31
 ; RV32-NEXT:    slli s2, s2, 1
-; RV32-NEXT:    sub a5, s11, a5
+; RV32-NEXT:    sub a2, s11, a2
 ; RV32-NEXT:    srli s11, t5, 31
 ; RV32-NEXT:    slli s3, s3, 1
 ; RV32-NEXT:    srli s0, s0, 31
 ; RV32-NEXT:    slli t5, t5, 1
-; RV32-NEXT:    or a2, ra, a2
+; RV32-NEXT:    or a6, ra, a6
 ; RV32-NEXT:    or t0, a1, t6
-; RV32-NEXT:    or a6, s2, a6
-; RV32-NEXT:    or s2, t2, t3
+; RV32-NEXT:    or s2, s2, s8
+; RV32-NEXT:    or s8, t2, t3
 ; RV32-NEXT:    or s3, s3, s11
 ; RV32-NEXT:    or t4, a1, t2
 ; RV32-NEXT:    lw s4, 52(sp) # 4-byte Folded Reload
@@ -1682,98 +1681,98 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    sw a0, 52(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sub ra, t1, s9
 ; RV32-NEXT:    sltu t1, t1, s9
-; RV32-NEXT:    sub a3, a3, a7
-; RV32-NEXT:    sub s11, a5, a4
-; RV32-NEXT:    or a4, t0, s2
+; RV32-NEXT:    sub a4, a4, a7
+; RV32-NEXT:    sub s11, a2, a5
+; RV32-NEXT:    or a2, t0, s8
 ; RV32-NEXT:    seqz a5, t4
 ; RV32-NEXT:    sub t2, t2, s4
 ; RV32-NEXT:    lw a0, 48(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    or s3, a0, s3
 ; RV32-NEXT:    lw a0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    or s2, a0, a6
-; RV32-NEXT:    or s0, s6, a2
+; RV32-NEXT:    or s2, a0, s2
+; RV32-NEXT:    lw a0, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    or s0, a0, a6
 ; RV32-NEXT:    andi a0, s7, 1
-; RV32-NEXT:    sub s9, a3, t1
-; RV32-NEXT:    snez a2, a4
-; RV32-NEXT:    sltu a3, t6, a5
+; RV32-NEXT:    sub s9, a4, t1
+; RV32-NEXT:    snez a2, a2
+; RV32-NEXT:    sltu a4, t6, a5
 ; RV32-NEXT:    sub t6, t6, a5
 ; RV32-NEXT:    add a2, s1, a2
-; RV32-NEXT:    sub t3, t3, a3
-; RV32-NEXT:    or a3, a1, t6
+; RV32-NEXT:    sub t3, t3, a4
+; RV32-NEXT:    or a4, a1, t6
 ; RV32-NEXT:    addi a2, a2, 1
-; RV32-NEXT:    or a4, t2, t3
+; RV32-NEXT:    or a5, t2, t3
 ; RV32-NEXT:    andi s1, a2, 1
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    or a3, a3, s1
-; RV32-NEXT:    sub s8, s10, s8
+; RV32-NEXT:    or a4, a4, a5
+; RV32-NEXT:    or a4, a4, s1
+; RV32-NEXT:    sub a3, s10, a3
 ; RV32-NEXT:    sw zero, 48(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw zero, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    li s6, 0
+; RV32-NEXT:    sw zero, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    li s7, 0
-; RV32-NEXT:    lw a5, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    beqz a3, .LBB3_56
+; RV32-NEXT:    beqz a4, .LBB3_56
 ; RV32-NEXT:  .LBB3_45: # %udiv-do-while
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    srli a2, ra, 31
-; RV32-NEXT:    slli a3, s9, 1
-; RV32-NEXT:    or a6, a3, a2
-; RV32-NEXT:    srli a3, s11, 31
+; RV32-NEXT:    slli a4, s9, 1
+; RV32-NEXT:    or a4, a4, a2
+; RV32-NEXT:    srli a2, s11, 31
 ; RV32-NEXT:    slli ra, ra, 1
-; RV32-NEXT:    or ra, ra, a3
-; RV32-NEXT:    beq s5, a6, .LBB3_47
+; RV32-NEXT:    or ra, ra, a2
+; RV32-NEXT:    beq s6, a4, .LBB3_47
 ; RV32-NEXT:  # %bb.46: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    sltu a3, s5, a6
+; RV32-NEXT:    sltu a2, s6, a4
 ; RV32-NEXT:    j .LBB3_48
 ; RV32-NEXT:  .LBB3_47: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    lw a2, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    sltu a3, a2, ra
+; RV32-NEXT:    lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sltu a2, a2, ra
 ; RV32-NEXT:  .LBB3_48: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    srli a4, s8, 31
+; RV32-NEXT:    srli a5, a3, 31
 ; RV32-NEXT:    slli s11, s11, 1
-; RV32-NEXT:    slli s8, s8, 1
-; RV32-NEXT:    or s11, s11, a4
+; RV32-NEXT:    slli a3, a3, 1
+; RV32-NEXT:    or s11, s11, a5
 ; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    or s10, s8, a0
-; RV32-NEXT:    beq a5, s11, .LBB3_50
+; RV32-NEXT:    or s10, a3, a0
+; RV32-NEXT:    beq s5, s11, .LBB3_50
 ; RV32-NEXT:  # %bb.49: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    sltu a0, a5, s11
+; RV32-NEXT:    sltu a0, s5, s11
 ; RV32-NEXT:    j .LBB3_51
 ; RV32-NEXT:  .LBB3_50: # in Loop: Header=BB3_45 Depth=1
 ; RV32-NEXT:    lw a0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    sltu a0, a0, s10
 ; RV32-NEXT:  .LBB3_51: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    lw a2, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    xor a4, a2, ra
-; RV32-NEXT:    xor a5, s5, a6
-; RV32-NEXT:    or a4, a4, a5
-; RV32-NEXT:    beqz a4, .LBB3_53
+; RV32-NEXT:    lw a3, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    xor a3, a3, ra
+; RV32-NEXT:    xor a5, s6, a4
+; RV32-NEXT:    or a3, a3, a5
+; RV32-NEXT:    beqz a3, .LBB3_53
 ; RV32-NEXT:  # %bb.52: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    mv a0, a2
 ; RV32-NEXT:  .LBB3_53: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    srli a3, s9, 31
-; RV32-NEXT:    lw a2, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    sub a3, a2, a3
-; RV32-NEXT:    sub a3, a3, a0
-; RV32-NEXT:    slli a0, a3, 31
+; RV32-NEXT:    srli a2, s9, 31
+; RV32-NEXT:    lw a3, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT:    sub a2, a3, a2
+; RV32-NEXT:    sub a2, a2, a0
+; RV32-NEXT:    slli a0, a2, 31
 ; RV32-NEXT:    srai a0, a0, 31
-; RV32-NEXT:    lw a3, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw a2, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    and s8, a0, a2
-; RV32-NEXT:    lw a5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    and a5, a0, a5
-; RV32-NEXT:    sltu a4, s10, s8
-; RV32-NEXT:    mv s9, a4
-; RV32-NEXT:    beq s11, a5, .LBB3_44
+; RV32-NEXT:    lw a3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    and a3, a0, a3
+; RV32-NEXT:    lw a2, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    and a2, a0, a2
+; RV32-NEXT:    sltu a5, s10, a3
+; RV32-NEXT:    mv s9, a5
+; RV32-NEXT:    beq s11, a2, .LBB3_44
 ; RV32-NEXT:  # %bb.54: # %udiv-do-while
 ; RV32-NEXT:    # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT:    sltu s9, s11, a5
+; RV32-NEXT:    sltu s9, s11, a2
 ; RV32-NEXT:    j .LBB3_44
 ; RV32-NEXT:  .LBB3_55:
 ; RV32-NEXT:    sw zero, 52(sp) # 4-byte Folded Spill
@@ -1785,18 +1784,18 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
 ; RV32-NEXT:    slli a0, s2, 1
 ; RV32-NEXT:    srli a3, s2, 31
 ; RV32-NEXT:    or a2, a0, a2
-; RV32-NEXT:    slli a0, s0, 1
+; RV32-NEXT:    slli a4, s0, 1
 ; RV32-NEXT:    srli s0, s0, 31
 ; RV32-NEXT:    slli t5, t5, 1
-; RV32-NEXT:    or a3, a0, a3
+; RV32-NEXT:    or a4, a4, a3
 ; RV32-NEXT:    lw a0, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT:    or a4, a0, t5
+; RV32-NEXT:    or a5, a0, t5
 ; RV32-NEXT:    lw s8, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:  .LBB3_57: # %udiv-end
-; RV32-NEXT:    sw a4, 0(s8)
+; RV32-NEXT:    sw a5, 0(s8)
 ; RV32-NEXT:    sw a1, 4(s8)
 ; RV32-NEXT:    sw a2, 8(s8)
-; RV32-NEXT:    sw a3, 12(s8)
+; RV32-NEXT:    sw a4, 12(s8)
 ; RV32-NEXT:    sb s0, 16(s8)
 ; RV32-NEXT:    lw ra, 236(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 232(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1a696d546a1a3..00f2e012c8b12 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -152,88 +152,77 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $176, %esp
-; X86-NEXT:    movl 32(%ebp), %ecx
-; X86-NEXT:    movl 36(%ebp), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    xorl %eax, %ecx
-; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    subl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%ebp), %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    movl 48(%ebp), %ecx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl 44(%ebp), %ebx
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    subl %edx, %esi
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    sete %dl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    sete %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    sete %al
-; X86-NEXT:    orb %cl, %al
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    bsrl %edi, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    bsrl %ebx, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    bsrl %ebx, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
 ; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    cmovnel %edx, %ebx
-; X86-NEXT:    orl $64, %ebx
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    cmovnel %ecx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    bsrl %edi, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    cmovnel %esi, %ecx
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    bsrl %eax, %esi
@@ -243,49 +232,68 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    orl $32, %edx
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    orl $64, %edx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    bsrl %ebx, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    cmovnel %edi, %esi
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    cmovnel %edx, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    subl %esi, %eax
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.2: # %select.false.sink
+; X86-NEXT:    movl $127, %ecx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %esi
+; X86-NEXT:  .LBB4_3: # %select.end
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %edx
+; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %edi, %eax
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    cmovnel %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    cmovnel %edi, %esi
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    jne .LBB4_8
-; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl $127, %ebx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB4_4
+; X86-NEXT:  # %bb.10: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl $127, %eax
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    je .LBB4_9
-; X86-NEXT:  # %bb.2: # %udiv-bb1
+; X86-NEXT:    je .LBB4_11
+; X86-NEXT:  # %bb.8: # %udiv-bb1
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
@@ -296,121 +304,112 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 152(%esp,%eax), %ebx
+; X86-NEXT:    movl 152(%esp,%eax), %edx
 ; X86-NEXT:    movl 156(%esp,%eax), %esi
-; X86-NEXT:    shldl %cl, %ebx, %esi
+; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 144(%esp,%eax), %esi
 ; X86-NEXT:    movl 148(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %ebx
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shldl %cl, %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    jae .LBB4_5
-; X86-NEXT:  # %bb.3:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_5: # %udiv-preheader
+; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-preheader
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 108(%esp,%eax), %edi
+; X86-NEXT:    movl 108(%esp,%eax), %ebx
+; X86-NEXT:    movl 104(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%eax), %esi
+; X86-NEXT:    movl 100(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    shrl %cl, %ebx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 104(%esp,%eax), %ebx
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrdl %cl, %edi, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%eax), %edx
-; X86-NEXT:    movl 100(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shrl %cl, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB4_6: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %edx
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %edx
-; X86-NEXT:    shldl $1, %ecx, %esi
-; X86-NEXT:    shldl $1, %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %edi, %edi
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl $1, %edi
@@ -424,73 +423,70 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_6
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ebx
-; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    leal (%edi,%esi,2), %edi
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:  .LBB4_8: # %udiv-end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:  .LBB4_9: # %udiv-end
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    xorl %ecx, %eax
-; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    subl %ecx, %edi
-; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl 56(%ebp), %ecx
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %ebx, 8(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    leal (%edi,%edx,2), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:  .LBB4_11: # %udiv-end
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %ecx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    subl %edx, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull 40(%ebp)
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    movl 44(%ebp), %esi
 ; X86-NEXT:    mull %esi
@@ -498,22 +494,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull 44(%ebp)
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    imull %eax, %edi
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull 44(%ebp), %ebx
-; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    imull 44(%ebp), %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl 48(%ebp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    imull %esi, %ecx
@@ -524,28 +521,38 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl 24(%ebp), %edx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 32(%ebp), %edi
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %edi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movb $1, %cl
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_9:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB4_7
+; X86-NEXT:  .LBB4_4:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jmp .LBB4_11
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 7f5ede7a858d2..3d756f3cf2141 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -152,359 +152,362 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %ebx
 ; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl 52(%ebp), %edi
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    sete %bl
+; X86-NEXT:    sete %cl
 ; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    orl 36(%ebp), %eax
 ; X86-NEXT:    movl 24(%ebp), %edx
 ; X86-NEXT:    orl 32(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    orb %bl, %al
+; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    bsrl %ebx, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl 40(%ebp), %ebx
 ; X86-NEXT:    xorl $31, %ebx
 ; X86-NEXT:    orl $32, %ebx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %edx, %ebx
 ; X86-NEXT:    orl $64, %ebx
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %ebx
-; X86-NEXT:    movl 36(%ebp), %esi
-; X86-NEXT:    bsrl %esi, %edx
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl 32(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    bsrl %edi, %esi
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl 24(%ebp), %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    orl $32, %edx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    orl $64, %edx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl 36(%ebp), %edi
 ; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $127, %edx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebx, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    setb %dl
-; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.2: # %select.false.sink
+; X86-NEXT:    movl $127, %eax
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %ebx, %eax
 ; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovnel %eax, %edx
-; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    cmovnel %eax, %ebx
-; X86-NEXT:    movl 28(%ebp), %edi
-; X86-NEXT:    cmovnel %eax, %edi
-; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:  .LBB4_3: # %select.end
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    cmovnel %ecx, %edx
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 56(%ebp), %esi
-; X86-NEXT:    jne .LBB4_6
-; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    cmovnel %ecx, %edi
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    cmovnel %ecx, %ebx
+; X86-NEXT:    jne .LBB4_9
+; X86-NEXT:  # %bb.4: # %select.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    je .LBB4_6
-; X86-NEXT:  # %bb.2: # %udiv-bb1
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    je .LBB4_9
+; X86-NEXT:  # %bb.5: # %udiv-bb1
 ; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 136(%esp,%eax), %edx
-; X86-NEXT:    movl 140(%esp,%eax), %edi
-; X86-NEXT:    shldl %cl, %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 128(%esp,%eax), %esi
-; X86-NEXT:    movl 132(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    addl $1, %ebx
+; X86-NEXT:    movl 136(%esp,%eax), %esi
+; X86-NEXT:    movl 140(%esp,%eax), %ebx
+; X86-NEXT:    shldl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    jae .LBB4_3
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    jmp .LBB4_5
-; X86-NEXT:  .LBB4_3: # %udiv-preheader
+; X86-NEXT:    movl 128(%esp,%eax), %edi
+; X86-NEXT:    movl 132(%esp,%eax), %ebx
+; X86-NEXT:    shldl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %edi, %ebx
+; X86-NEXT:    shll %cl, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, %edx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jb .LBB4_10
+; X86-NEXT:  # %bb.6: # %udiv-preheader
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 32(%ebp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 36(%ebp), %edx
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 92(%esp,%eax), %edi
-; X86-NEXT:    movl 88(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl %cl, %edi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %esi
+; X86-NEXT:    movl 92(%esp,%eax), %esi
+; X86-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    shrdl %cl, %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 80(%esp,%eax), %edx
 ; X86-NEXT:    movl 84(%esp,%eax), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    shrl %cl, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shrdl %cl, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %esi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %eax
-; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%ebp), %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 48(%ebp), %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 52(%ebp), %eax
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    addl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    .p2align 4
-; X86-NEXT:  .LBB4_4: # %udiv-do-while
+; X86-NEXT:  .LBB4_7: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    shldl $1, %edx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %edx
-; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %esi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl 52(%ebp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl 48(%ebp), %eax
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl 52(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl 48(%ebp), %esi
 ; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    andl 44(%ebp), %ebx
 ; X86-NEXT:    andl 40(%ebp), %ecx
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    subl %ecx, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    jne .LBB4_4
-; X86-NEXT:  .LBB4_5: # %udiv-loop-exit
+; X86-NEXT:    jne .LBB4_7
+; X86-NEXT:  .LBB4_8: # %udiv-loop-exit
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %eax
+; X86-NEXT:    shldl $1, %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edx
-; X86-NEXT:    shldl $1, %eax, %ecx
+; X86-NEXT:    shldl $1, %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    shldl $1, %esi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    leal (%eax,%esi,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    movl 56(%ebp), %esi
-; X86-NEXT:  .LBB4_6: # %udiv-end
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, (%esi)
-; X86-NEXT:    movl %edi, 4(%esi)
-; X86-NEXT:    movl %ebx, 8(%esi)
-; X86-NEXT:    movl %edx, 12(%esi)
+; X86-NEXT:    leal (%esi,%edx,2), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:  .LBB4_9: # %udiv-end
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl 48(%ebp), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl 52(%ebp), %eax
-; X86-NEXT:    imull %edi, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %edi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl 44(%ebp), %esi
-; X86-NEXT:    imull %esi, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl 56(%ebp), %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull 40(%ebp), %ebx
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull 44(%ebp)
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull 44(%ebp)
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl 32(%ebp), %esi
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    movl 36(%ebp), %ebx
-; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl 28(%ebp), %ebx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_10:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jmp .LBB4_8
 ;
 ; X64-LABEL: scalar_i128:
 ; X64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 412455384e937..b3cb7401e6402 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,10 +22,10 @@ define void @f() nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $160, %esp
+; X86-NEXT:    subl $176, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movzbl (%eax), %eax
 ; X86-NEXT:    movzbl (%eax), %ecx
 ; X86-NEXT:    movzbl %al, %eax
@@ -37,17 +37,18 @@ define void @f() nounwind {
 ; X86-NEXT:    sarl $30, %ecx
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    xorl %eax, %edi
-; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    shrdl $1, %eax, %ecx
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    subl %ecx, %esi
-; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    shldl $30, %edx, %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $30, %ebx, %ecx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    shldl $30, %esi, %edx
 ; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %BB_udiv-special-cases
 ; X86-NEXT:    bsrl %edx, %eax
@@ -75,37 +76,50 @@ define void @f() nounwind {
 ; X86-NEXT:    addl $64, %esi
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:  .LBB0_8: # %BB_udiv-special-cases
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    addl $-66, %eax
 ; X86-NEXT:    movl $0, %ebx
 ; X86-NEXT:    adcl $-1, %ebx
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    adcl $3, %edx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    adcl $3, %esi
+; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    movb $1, %cl
 ; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    jne .LBB0_14
-; X86-NEXT:  # %bb.9: # %BB_udiv-special-cases
-; X86-NEXT:    andl $3, %edx
+; X86-NEXT:    jne .LBB0_10
+; X86-NEXT:  # %bb.9: # %select.false.sink
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    movl $65, %edx
+; X86-NEXT:    cmpl %eax, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    setb %cl
+; X86-NEXT:  .LBB0_10: # %select.end
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    jne .LBB0_15
+; X86-NEXT:  # %bb.11: # %select.end
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorl $65, %ecx
-; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    orl %ebx, %ecx
-; X86-NEXT:    je .LBB0_14
-; X86-NEXT:  # %bb.10: # %udiv-bb1
+; X86-NEXT:    je .LBB0_15
+; X86-NEXT:  # %bb.12: # %udiv-bb1
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl $1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    andl $3, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb $65, %cl
 ; X86-NEXT:    subb %al, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %esi
+; X86-NEXT:    movsbl %al, %edx
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -114,24 +128,23 @@ define void @f() nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 128(%esp,%edx), %eax
+; X86-NEXT:    movl 132(%esp,%edx), %edi
+; X86-NEXT:    movl 136(%esp,%edx), %edx
+; X86-NEXT:    shldl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 112(%esp,%esi), %edi
-; X86-NEXT:    movl 116(%esp,%esi), %eax
-; X86-NEXT:    movl 120(%esp,%esi), %esi
-; X86-NEXT:    shldl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %eax
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    je .LBB0_13
-; X86-NEXT:  # %bb.11: # %udiv-preheader
-; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    je .LBB0_15
+; X86-NEXT:  # %bb.13: # %udiv-preheader
+; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -146,15 +159,14 @@ define void @f() nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 72(%esp,%eax), %ebx
-; X86-NEXT:    movl 64(%esp,%eax), %esi
-; X86-NEXT:    movl 68(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    shrdl %cl, %ebx, %eax
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl 88(%esp,%eax), %edi
+; X86-NEXT:    movl 80(%esp,%eax), %ebx
+; X86-NEXT:    movl 84(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    shrdl %cl, %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %ebx
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
@@ -162,78 +174,77 @@ define void @f() nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $3, %edi
-; X86-NEXT:    andl $3, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $3, %esi
+; X86-NEXT:    andl $3, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    .p2align 4
-; X86-NEXT:  .LBB0_12: # %udiv-do-while
+; X86-NEXT:  .LBB0_14: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shldl $1, %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    andl $2, %edx
-; X86-NEXT:    shrl %edx
-; X86-NEXT:    leal (%edx,%ebx,2), %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $1, %edx, %edi
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    andl $2, %esi
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    leal (%esi,%ebx,2), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, %edi
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %edx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    andl $3, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    shll $30, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    sarl $30, %edi
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    shrdl $1, %edx, %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    shll $30, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sarl $30, %esi
+; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    shrdl $1, %edi, %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    subl %esi, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sbbl %edi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $3, %edi
-; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $3, %ebx
+; X86-NEXT:    andl $3, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    jne .LBB0_12
-; X86-NEXT:  .LBB0_13: # %udiv-loop-exit
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT:  .LBB0_14: # %udiv-end
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    jne .LBB0_14
+; X86-NEXT:  .LBB0_15: # %udiv-end
 ; X86-NEXT:    cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
 ; X86-NEXT:    setne (%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%eax)
 ; X86-NEXT:    movb $0, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
index fc823cd543144..751bdbade15d9 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=x86_64-- -expand-ir-insts -expand-div-rem-bits 128 < %s | FileCheck %s
 ; RUN: opt -S -mtriple=x86_64-- -passes='require<libcall-lowering-info>,expand-ir-insts' -expand-div-rem-bits 128 < %s | FileCheck %s
 
-define void @sdiv129(ptr %ptr, ptr %out) nounwind {
+define void @sdiv129(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @sdiv129(
 ; CHECK-NEXT:  _udiv-special-cases:
 ; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
@@ -24,11 +24,11 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP15:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP10]], i1 true)
 ; CHECK-NEXT:    [[TMP16:%.*]] = sub i129 [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp ugt i129 [[TMP16]], 128
-; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP13]], i1 true, i1 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = select i1 [[TMP13]], i1 true, i1 [[TMP17]], !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i129 [[TMP16]], 128
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP18]], i129 0, i129 [[TMP10]]
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP18]], i1 true, i1 [[TMP19]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP18]], i129 0, i129 [[TMP10]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP18]], i1 true, i1 [[TMP19]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP22:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP37:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP23:%.*]] = phi i129 [ [[TMP46:%.*]], [[UDIV_BB1]] ], [ [[TMP34:%.*]], [[UDIV_DO_WHILE]] ]
@@ -52,7 +52,7 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP39]] = sub i129 [[TMP32]], [[TMP38]]
 ; CHECK-NEXT:    [[TMP40]] = add i129 [[TMP27]], -1
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i129 [[TMP40]], 0
-; CHECK-NEXT:    br i1 [[TMP41]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP41]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP42]] = lshr i129 [[TMP10]], [[TMP44]]
 ; CHECK-NEXT:    [[TMP43]] = add i129 [[TMP9]], -1
@@ -62,7 +62,7 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP45:%.*]] = sub i129 128, [[TMP16]]
 ; CHECK-NEXT:    [[TMP46]] = shl i129 [[TMP10]], [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i129 [[TMP44]], 0
-; CHECK-NEXT:    br i1 [[TMP47]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP47]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP48:%.*]] = phi i129 [ [[TMP25]], [[UDIV_LOOP_EXIT]] ], [ [[TMP20]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP49:%.*]] = xor i129 [[TMP48]], [[TMP8]]
@@ -75,3 +75,13 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind {
   store i129 %res, ptr %out
   ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[PROF2]] = !{!"unknown", !"integer-division"}
+;.
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
index 667152228d258..45491ccda2b19 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=x86_64-- -expand-ir-insts -expand-div-rem-bits 128 < %s | FileCheck %s
 ; RUN: opt -S -mtriple=x86_64-- -passes='require<libcall-lowering-info>,expand-ir-insts' -expand-div-rem-bits 128 < %s | FileCheck %s
 
-define void @test(ptr %ptr, ptr %out) nounwind {
+define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
 ; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
@@ -25,11 +25,11 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP11]], i1 true)
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub i129 [[TMP15]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp ugt i129 [[TMP17]], 128
-; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP14]], i1 true, i1 [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP14]], i1 true, i1 [[TMP18]], !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i129 [[TMP17]], 128
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP19]], i129 0, i129 [[TMP11]]
-; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP20]]
-; CHECK-NEXT:    br i1 [[TMP22]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP19]], i129 0, i129 [[TMP11]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP19]], i1 true, i1 [[TMP20]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP22]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP23:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP38:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = phi i129 [ [[TMP47:%.*]], [[UDIV_BB1]] ], [ [[TMP35:%.*]], [[UDIV_DO_WHILE]] ]
@@ -53,7 +53,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP40]] = sub i129 [[TMP33]], [[TMP39]]
 ; CHECK-NEXT:    [[TMP41]] = add i129 [[TMP28]], -1
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i129 [[TMP41]], 0
-; CHECK-NEXT:    br i1 [[TMP42]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP43]] = lshr i129 [[TMP11]], [[TMP45]]
 ; CHECK-NEXT:    [[TMP44]] = add i129 [[TMP10]], -1
@@ -63,7 +63,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP46:%.*]] = sub i129 128, [[TMP17]]
 ; CHECK-NEXT:    [[TMP47]] = shl i129 [[TMP11]], [[TMP46]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = icmp eq i129 [[TMP45]], 0
-; CHECK-NEXT:    br i1 [[TMP48]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP48]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP49:%.*]] = phi i129 [ [[TMP26]], [[UDIV_LOOP_EXIT]] ], [ [[TMP21]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP50:%.*]] = mul i129 [[TMP9]], [[TMP49]]
@@ -78,3 +78,13 @@ define void @test(ptr %ptr, ptr %out) nounwind {
   store i129 %res, ptr %out
   ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[PROF2]] = !{!"unknown", !"integer-division"}
+;.
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
index b2b83815f79b0..6ad696ae446fd 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=x86_64-- -expand-ir-insts -expand-div-rem-bits 128 < %s | FileCheck %s
 ; RUN: opt -S -mtriple=x86_64-- -passes='require<libcall-lowering-info>,expand-ir-insts' -expand-div-rem-bits 128 < %s | FileCheck %s
 
-define void @test(ptr %ptr, ptr %out) nounwind {
+define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
 ; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
@@ -15,11 +15,11 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP1]], i1 true)
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub i129 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i129 [[TMP7]], 128
-; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP4]], i1 true, i1 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP4]], i1 true, i1 [[TMP8]], !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i129 [[TMP7]], 128
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i129 0, i129 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP9]], i1 true, i1 [[TMP10]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i129 0, i129 [[TMP1]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP9]], i1 true, i1 [[TMP10]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP28:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi i129 [ [[TMP37:%.*]], [[UDIV_BB1]] ], [ [[TMP25:%.*]], [[UDIV_DO_WHILE]] ]
@@ -43,7 +43,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP30]] = sub i129 [[TMP23]], [[TMP29]]
 ; CHECK-NEXT:    [[TMP31]] = add i129 [[TMP18]], -1
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i129 [[TMP31]], 0
-; CHECK-NEXT:    br i1 [[TMP32]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP32]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP33]] = lshr i129 [[TMP1]], [[TMP35]]
 ; CHECK-NEXT:    [[TMP34]] = add i129 [[TMP0]], -1
@@ -53,7 +53,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP36:%.*]] = sub i129 128, [[TMP7]]
 ; CHECK-NEXT:    [[TMP37]] = shl i129 [[TMP1]], [[TMP36]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i129 [[TMP35]], 0
-; CHECK-NEXT:    br i1 [[TMP38]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi i129 [ [[TMP16]], [[UDIV_LOOP_EXIT]] ], [ [[TMP11]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    store i129 [[TMP39]], ptr [[OUT:%.*]], align 16
@@ -64,3 +64,13 @@ define void @test(ptr %ptr, ptr %out) nounwind {
   store i129 %res, ptr %out
   ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[PROF2]] = !{!"unknown", !"integer-division"}
+;.
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
index 46e72001b2c2d..a4c4ac2cba329 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
 ; RUN: opt -S -mtriple=x86_64-- -expand-ir-insts -expand-div-rem-bits 128 < %s | FileCheck %s
 ; RUN: opt -S -mtriple=x86_64-- -passes='require<libcall-lowering-info>,expand-ir-insts' -expand-div-rem-bits 128 < %s | FileCheck %s
 
-define void @test(ptr %ptr, ptr %out) nounwind {
+define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
 ; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
@@ -17,11 +17,11 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub i129 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i129 [[TMP9]], 128
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP6]], i1 true, i1 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP6]], i1 true, i1 [[TMP10]], !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i129 [[TMP9]], 128
-; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP11]], i129 0, i129 [[TMP3]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP11]], i1 true, i1 [[TMP12]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP11]], i129 0, i129 [[TMP3]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP11]], i1 true, i1 [[TMP12]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP30:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i129 [ [[TMP39:%.*]], [[UDIV_BB1]] ], [ [[TMP27:%.*]], [[UDIV_DO_WHILE]] ]
@@ -45,7 +45,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP32]] = sub i129 [[TMP25]], [[TMP31]]
 ; CHECK-NEXT:    [[TMP33]] = add i129 [[TMP20]], -1
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i129 [[TMP33]], 0
-; CHECK-NEXT:    br i1 [[TMP34]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP35]] = lshr i129 [[TMP3]], [[TMP37]]
 ; CHECK-NEXT:    [[TMP36]] = add i129 [[TMP2]], -1
@@ -55,7 +55,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP38:%.*]] = sub i129 128, [[TMP9]]
 ; CHECK-NEXT:    [[TMP39]] = shl i129 [[TMP3]], [[TMP38]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq i129 [[TMP37]], 0
-; CHECK-NEXT:    br i1 [[TMP40]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP40]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i129 [[TMP1]], [[TMP41]]
@@ -68,3 +68,13 @@ define void @test(ptr %ptr, ptr %out) nounwind {
   store i129 %res, ptr %out
   ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[PROF2]] = !{!"unknown", !"integer-division"}
+;.
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/vector.ll b/llvm/test/Transforms/ExpandIRInsts/X86/vector.ll
index 58e74b8d17b55..727e59b5bdef0 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/vector.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/vector.ll
@@ -1,10 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
 ; RUN: opt -S -mtriple=x86_64-- -expand-ir-insts -expand-div-rem-bits 128 < %s | FileCheck %s
 ; RUN: opt -S -mtriple=x86_64-- -passes='require<libcall-lowering-info>,expand-ir-insts' -expand-div-rem-bits 128 < %s | FileCheck %s
 
-define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
+define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind !prof !0 {
 ; CHECK-LABEL: define <2 x i129> @sdiv129(
-; CHECK-SAME: <2 x i129> [[A:%.*]], <2 x i129> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: <2 x i129> [[A:%.*]], <2 x i129> [[B:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF0:![0-9]+]] {
 ; CHECK-NEXT:  _udiv-special-cases_udiv-special-cases:
 ; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i129> [[A]], i64 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i129> [[B]], i64 0
@@ -26,11 +26,11 @@ define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP12]], i1 true)
 ; CHECK-NEXT:    [[TMP18:%.*]] = sub i129 [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ugt i129 [[TMP18]], 128
-; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP15]], i1 true, i1 [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP15]], i1 true, i1 [[TMP19]], !prof [[PROF1:![0-9]+]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i129 [[TMP18]], 128
-; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP20]], i129 0, i129 [[TMP12]]
-; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP21]]
-; CHECK-NEXT:    br i1 [[TMP23]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP20]], i129 0, i129 [[TMP12]], !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP20]], i1 true, i1 [[TMP21]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit2:
 ; CHECK-NEXT:    [[TMP24:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP39:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
 ; CHECK-NEXT:    [[TMP25:%.*]] = phi i129 [ [[TMP48:%.*]], [[UDIV_BB15]] ], [ [[TMP36:%.*]], [[UDIV_DO_WHILE3]] ]
@@ -54,7 +54,7 @@ define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP41]] = sub i129 [[TMP34]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP42]] = add i129 [[TMP29]], -1
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp eq i129 [[TMP42]], 0
-; CHECK-NEXT:    br i1 [[TMP43]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK-NEXT:    br i1 [[TMP43]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader4:
 ; CHECK-NEXT:    [[TMP44]] = lshr i129 [[TMP12]], [[TMP46]]
 ; CHECK-NEXT:    [[TMP45]] = add i129 [[TMP11]], -1
@@ -64,7 +64,7 @@ define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP47:%.*]] = sub i129 128, [[TMP18]]
 ; CHECK-NEXT:    [[TMP48]] = shl i129 [[TMP12]], [[TMP47]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i129 [[TMP46]], 0
-; CHECK-NEXT:    br i1 [[TMP49]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]], !prof [[PROF1]]
 ; CHECK:       udiv-end1:
 ; CHECK-NEXT:    [[TMP50:%.*]] = phi i129 [ [[TMP27]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP22]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP51:%.*]] = xor i129 [[TMP50]], [[TMP10]]
@@ -90,11 +90,11 @@ define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP71:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP66]], i1 true)
 ; CHECK-NEXT:    [[TMP72:%.*]] = sub i129 [[TMP70]], [[TMP71]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = icmp ugt i129 [[TMP72]], 128
-; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP69]], i1 true, i1 [[TMP73]]
+; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP69]], i1 true, i1 [[TMP73]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP75:%.*]] = icmp eq i129 [[TMP72]], 128
-; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP74]], i129 0, i129 [[TMP66]]
-; CHECK-NEXT:    [[TMP77:%.*]] = select i1 [[TMP74]], i1 true, i1 [[TMP75]]
-; CHECK-NEXT:    br i1 [[TMP77]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP74]], i129 0, i129 [[TMP66]], !prof [[PROF2]]
+; CHECK-NEXT:    [[TMP77:%.*]] = select i1 [[TMP74]], i1 true, i1 [[TMP75]], !prof [[PROF2]]
+; CHECK-NEXT:    br i1 [[TMP77]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP78:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP93:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP79:%.*]] = phi i129 [ [[TMP102:%.*]], [[UDIV_BB1]] ], [ [[TMP90:%.*]], [[UDIV_DO_WHILE]] ]
@@ -118,7 +118,7 @@ define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP95]] = sub i129 [[TMP88]], [[TMP94]]
 ; CHECK-NEXT:    [[TMP96]] = add i129 [[TMP83]], -1
 ; CHECK-NEXT:    [[TMP97:%.*]] = icmp eq i129 [[TMP96]], 0
-; CHECK-NEXT:    br i1 [[TMP97]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP97]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP98]] = lshr i129 [[TMP66]], [[TMP100]]
 ; CHECK-NEXT:    [[TMP99]] = add i129 [[TMP65]], -1
@@ -128,7 +128,7 @@ define <2 x i129> @sdiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP101:%.*]] = sub i129 128, [[TMP72]]
 ; CHECK-NEXT:    [[TMP102]] = shl i129 [[TMP66]], [[TMP101]]
 ; CHECK-NEXT:    [[TMP103:%.*]] = icmp eq i129 [[TMP100]], 0
-; CHECK-NEXT:    br i1 [[TMP103]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP103]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP104:%.*]] = phi i129 [ [[TMP81]], [[UDIV_LOOP_EXIT]] ], [ [[TMP76]], [[UDIV_END1]] ]
 ; CHECK-NEXT:    [[TMP105:%.*]] = xor i129 [[TMP104]], [[TMP64]]
@@ -155,11 +155,11 @@ define <2 x i129> @udiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub i129 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i129 [[TMP9]], 128
-; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP6]], i1 true, i1 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select i1 [[TMP6]], i1 true, i1 [[TMP10]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i129 [[TMP9]], 128
 ; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP11]], i129 0, i129 [[TMP3]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP11]], i1 true, i1 [[TMP12]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit2:
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP30:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi i129 [ [[TMP39:%.*]], [[UDIV_BB15]] ], [ [[TMP27:%.*]], [[UDIV_DO_WHILE3]] ]
@@ -183,7 +183,7 @@ define <2 x i129> @udiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP32]] = sub i129 [[TMP25]], [[TMP31]]
 ; CHECK-NEXT:    [[TMP33]] = add i129 [[TMP20]], -1
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i129 [[TMP33]], 0
-; CHECK-NEXT:    br i1 [[TMP34]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK-NEXT:    br i1 [[TMP34]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader4:
 ; CHECK-NEXT:    [[TMP35]] = lshr i129 [[TMP3]], [[TMP37]]
 ; CHECK-NEXT:    [[TMP36]] = add i129 [[TMP2]], -1
@@ -193,7 +193,7 @@ define <2 x i129> @udiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP38:%.*]] = sub i129 128, [[TMP9]]
 ; CHECK-NEXT:    [[TMP39]] = shl i129 [[TMP3]], [[TMP38]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq i129 [[TMP37]], 0
-; CHECK-NEXT:    br i1 [[TMP40]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK-NEXT:    br i1 [[TMP40]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]], !prof [[PROF1]]
 ; CHECK:       udiv-end1:
 ; CHECK-NEXT:    [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <2 x i129> poison, i129 [[TMP41]], i64 0
@@ -208,11 +208,11 @@ define <2 x i129> @udiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP51:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP46]], i1 true)
 ; CHECK-NEXT:    [[TMP52:%.*]] = sub i129 [[TMP50]], [[TMP51]]
 ; CHECK-NEXT:    [[TMP53:%.*]] = icmp ugt i129 [[TMP52]], 128
-; CHECK-NEXT:    [[TMP54:%.*]] = select i1 [[TMP49]], i1 true, i1 [[TMP53]]
+; CHECK-NEXT:    [[TMP54:%.*]] = select i1 [[TMP49]], i1 true, i1 [[TMP53]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i129 [[TMP52]], 128
 ; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i129 0, i129 [[TMP46]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = select i1 [[TMP54]], i1 true, i1 [[TMP55]]
-; CHECK-NEXT:    br i1 [[TMP57]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    br i1 [[TMP57]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP58:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP73:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP59:%.*]] = phi i129 [ [[TMP82:%.*]], [[UDIV_BB1]] ], [ [[TMP70:%.*]], [[UDIV_DO_WHILE]] ]
@@ -236,7 +236,7 @@ define <2 x i129> @udiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP75]] = sub i129 [[TMP68]], [[TMP74]]
 ; CHECK-NEXT:    [[TMP76]] = add i129 [[TMP63]], -1
 ; CHECK-NEXT:    [[TMP77:%.*]] = icmp eq i129 [[TMP76]], 0
-; CHECK-NEXT:    br i1 [[TMP77]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP77]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP78]] = lshr i129 [[TMP46]], [[TMP80]]
 ; CHECK-NEXT:    [[TMP79]] = add i129 [[TMP45]], -1
@@ -246,7 +246,7 @@ define <2 x i129> @udiv129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP81:%.*]] = sub i129 128, [[TMP52]]
 ; CHECK-NEXT:    [[TMP82]] = shl i129 [[TMP46]], [[TMP81]]
 ; CHECK-NEXT:    [[TMP83:%.*]] = icmp eq i129 [[TMP80]], 0
-; CHECK-NEXT:    br i1 [[TMP83]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP83]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP84:%.*]] = phi i129 [ [[TMP61]], [[UDIV_LOOP_EXIT]] ], [ [[TMP56]], [[UDIV_END1]] ]
 ; CHECK-NEXT:    [[TMP85:%.*]] = insertelement <2 x i129> [[TMP42]], i129 [[TMP84]], i64 1
@@ -281,11 +281,11 @@ define <2 x i129> @srem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP13]], i1 true)
 ; CHECK-NEXT:    [[TMP19:%.*]] = sub i129 [[TMP17]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp ugt i129 [[TMP19]], 128
-; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP16]], i1 true, i1 [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP16]], i1 true, i1 [[TMP20]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i129 [[TMP19]], 128
 ; CHECK-NEXT:    [[TMP23:%.*]] = select i1 [[TMP21]], i129 0, i129 [[TMP13]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP21]], i1 true, i1 [[TMP22]]
-; CHECK-NEXT:    br i1 [[TMP24]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit2:
 ; CHECK-NEXT:    [[TMP25:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP40:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
 ; CHECK-NEXT:    [[TMP26:%.*]] = phi i129 [ [[TMP49:%.*]], [[UDIV_BB15]] ], [ [[TMP37:%.*]], [[UDIV_DO_WHILE3]] ]
@@ -309,7 +309,7 @@ define <2 x i129> @srem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP42]] = sub i129 [[TMP35]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP43]] = add i129 [[TMP30]], -1
 ; CHECK-NEXT:    [[TMP44:%.*]] = icmp eq i129 [[TMP43]], 0
-; CHECK-NEXT:    br i1 [[TMP44]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK-NEXT:    br i1 [[TMP44]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader4:
 ; CHECK-NEXT:    [[TMP45]] = lshr i129 [[TMP13]], [[TMP47]]
 ; CHECK-NEXT:    [[TMP46]] = add i129 [[TMP12]], -1
@@ -319,7 +319,7 @@ define <2 x i129> @srem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP48:%.*]] = sub i129 128, [[TMP19]]
 ; CHECK-NEXT:    [[TMP49]] = shl i129 [[TMP13]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = icmp eq i129 [[TMP47]], 0
-; CHECK-NEXT:    br i1 [[TMP50]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK-NEXT:    br i1 [[TMP50]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]], !prof [[PROF1]]
 ; CHECK:       udiv-end1:
 ; CHECK-NEXT:    [[TMP51:%.*]] = phi i129 [ [[TMP28]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP23]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP52:%.*]] = mul i129 [[TMP11]], [[TMP51]]
@@ -348,11 +348,11 @@ define <2 x i129> @srem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP75:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP70]], i1 true)
 ; CHECK-NEXT:    [[TMP76:%.*]] = sub i129 [[TMP74]], [[TMP75]]
 ; CHECK-NEXT:    [[TMP77:%.*]] = icmp ugt i129 [[TMP76]], 128
-; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP73]], i1 true, i1 [[TMP77]]
+; CHECK-NEXT:    [[TMP78:%.*]] = select i1 [[TMP73]], i1 true, i1 [[TMP77]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP79:%.*]] = icmp eq i129 [[TMP76]], 128
 ; CHECK-NEXT:    [[TMP80:%.*]] = select i1 [[TMP78]], i129 0, i129 [[TMP70]]
 ; CHECK-NEXT:    [[TMP81:%.*]] = select i1 [[TMP78]], i1 true, i1 [[TMP79]]
-; CHECK-NEXT:    br i1 [[TMP81]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    br i1 [[TMP81]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP82:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP97:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP83:%.*]] = phi i129 [ [[TMP106:%.*]], [[UDIV_BB1]] ], [ [[TMP94:%.*]], [[UDIV_DO_WHILE]] ]
@@ -376,7 +376,7 @@ define <2 x i129> @srem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP99]] = sub i129 [[TMP92]], [[TMP98]]
 ; CHECK-NEXT:    [[TMP100]] = add i129 [[TMP87]], -1
 ; CHECK-NEXT:    [[TMP101:%.*]] = icmp eq i129 [[TMP100]], 0
-; CHECK-NEXT:    br i1 [[TMP101]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP101]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP102]] = lshr i129 [[TMP70]], [[TMP104]]
 ; CHECK-NEXT:    [[TMP103]] = add i129 [[TMP69]], -1
@@ -386,7 +386,7 @@ define <2 x i129> @srem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP105:%.*]] = sub i129 128, [[TMP76]]
 ; CHECK-NEXT:    [[TMP106]] = shl i129 [[TMP70]], [[TMP105]]
 ; CHECK-NEXT:    [[TMP107:%.*]] = icmp eq i129 [[TMP104]], 0
-; CHECK-NEXT:    br i1 [[TMP107]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP107]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP108:%.*]] = phi i129 [ [[TMP85]], [[UDIV_LOOP_EXIT]] ], [ [[TMP80]], [[UDIV_END1]] ]
 ; CHECK-NEXT:    [[TMP109:%.*]] = mul i129 [[TMP68]], [[TMP108]]
@@ -417,11 +417,11 @@ define <2 x i129> @urem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP5]], i1 true)
 ; CHECK-NEXT:    [[TMP11:%.*]] = sub i129 [[TMP9]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ugt i129 [[TMP11]], 128
-; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP8]], i1 true, i1 [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP8]], i1 true, i1 [[TMP12]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i129 [[TMP11]], 128
 ; CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP13]], i129 0, i129 [[TMP5]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP13]], i1 true, i1 [[TMP14]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[UDIV_END1:%.*]], label [[UDIV_BB15:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit2:
 ; CHECK-NEXT:    [[TMP17:%.*]] = phi i129 [ 0, [[UDIV_BB15]] ], [ [[TMP32:%.*]], [[UDIV_DO_WHILE3:%.*]] ]
 ; CHECK-NEXT:    [[TMP18:%.*]] = phi i129 [ [[TMP41:%.*]], [[UDIV_BB15]] ], [ [[TMP29:%.*]], [[UDIV_DO_WHILE3]] ]
@@ -445,7 +445,7 @@ define <2 x i129> @urem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP34]] = sub i129 [[TMP27]], [[TMP33]]
 ; CHECK-NEXT:    [[TMP35]] = add i129 [[TMP22]], -1
 ; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i129 [[TMP35]], 0
-; CHECK-NEXT:    br i1 [[TMP36]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[UDIV_LOOP_EXIT2:%.*]], label [[UDIV_DO_WHILE3]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader4:
 ; CHECK-NEXT:    [[TMP37]] = lshr i129 [[TMP5]], [[TMP39]]
 ; CHECK-NEXT:    [[TMP38]] = add i129 [[TMP4]], -1
@@ -455,7 +455,7 @@ define <2 x i129> @urem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP40:%.*]] = sub i129 128, [[TMP11]]
 ; CHECK-NEXT:    [[TMP41]] = shl i129 [[TMP5]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i129 [[TMP39]], 0
-; CHECK-NEXT:    br i1 [[TMP42]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[UDIV_LOOP_EXIT2]], label [[UDIV_PREHEADER4]], !prof [[PROF1]]
 ; CHECK:       udiv-end1:
 ; CHECK-NEXT:    [[TMP43:%.*]] = phi i129 [ [[TMP20]], [[UDIV_LOOP_EXIT2]] ], [ [[TMP15]], [[_UDIV_SPECIAL_CASES_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP44:%.*]] = mul i129 [[TMP3]], [[TMP43]]
@@ -474,11 +474,11 @@ define <2 x i129> @urem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP57:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP52]], i1 true)
 ; CHECK-NEXT:    [[TMP58:%.*]] = sub i129 [[TMP56]], [[TMP57]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = icmp ugt i129 [[TMP58]], 128
-; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP55]], i1 true, i1 [[TMP59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP55]], i1 true, i1 [[TMP59]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp eq i129 [[TMP58]], 128
 ; CHECK-NEXT:    [[TMP62:%.*]] = select i1 [[TMP60]], i129 0, i129 [[TMP52]]
 ; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP60]], i1 true, i1 [[TMP61]]
-; CHECK-NEXT:    br i1 [[TMP63]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]]
+; CHECK-NEXT:    br i1 [[TMP63]], label [[UDIV_END:%.*]], label [[UDIV_BB1:%.*]], !prof [[PROF1]]
 ; CHECK:       udiv-loop-exit:
 ; CHECK-NEXT:    [[TMP64:%.*]] = phi i129 [ 0, [[UDIV_BB1]] ], [ [[TMP79:%.*]], [[UDIV_DO_WHILE:%.*]] ]
 ; CHECK-NEXT:    [[TMP65:%.*]] = phi i129 [ [[TMP88:%.*]], [[UDIV_BB1]] ], [ [[TMP76:%.*]], [[UDIV_DO_WHILE]] ]
@@ -502,7 +502,7 @@ define <2 x i129> @urem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP81]] = sub i129 [[TMP74]], [[TMP80]]
 ; CHECK-NEXT:    [[TMP82]] = add i129 [[TMP69]], -1
 ; CHECK-NEXT:    [[TMP83:%.*]] = icmp eq i129 [[TMP82]], 0
-; CHECK-NEXT:    br i1 [[TMP83]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]]
+; CHECK-NEXT:    br i1 [[TMP83]], label [[UDIV_LOOP_EXIT:%.*]], label [[UDIV_DO_WHILE]], !prof [[PROF1]]
 ; CHECK:       udiv-preheader:
 ; CHECK-NEXT:    [[TMP84]] = lshr i129 [[TMP52]], [[TMP86]]
 ; CHECK-NEXT:    [[TMP85]] = add i129 [[TMP51]], -1
@@ -512,7 +512,7 @@ define <2 x i129> @urem129(<2 x i129> %a, <2 x i129> %b) nounwind {
 ; CHECK-NEXT:    [[TMP87:%.*]] = sub i129 128, [[TMP58]]
 ; CHECK-NEXT:    [[TMP88]] = shl i129 [[TMP52]], [[TMP87]]
 ; CHECK-NEXT:    [[TMP89:%.*]] = icmp eq i129 [[TMP86]], 0
-; CHECK-NEXT:    br i1 [[TMP89]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
+; CHECK-NEXT:    br i1 [[TMP89]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP90:%.*]] = phi i129 [ [[TMP67]], [[UDIV_LOOP_EXIT]] ], [ [[TMP62]], [[UDIV_END1]] ]
 ; CHECK-NEXT:    [[TMP91:%.*]] = mul i129 [[TMP50]], [[TMP90]]
@@ -534,3 +534,13 @@ define <vscale x 2 x i129> @sdiv129_scalable(<vscale x 2 x i129> %a, <vscale x 2
   %res = sdiv <vscale x 2 x i129> %a, %b
   ret <vscale x 2 x i129> %res
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[PROF2]] = !{!"unknown", !"integer-division"}
+;.



More information about the llvm-commits mailing list