[llvm] 0cdc1b6 - [SelectionDAG] Support integer types with multiple registers in ComputePHILiveOutRegInfo. (#172081)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Dec 13 13:24:47 PST 2025
Author: Craig Topper
Date: 2025-12-13T13:24:41-08:00
New Revision: 0cdc1b6dd4a870fc41d4b15ad97e0001882aba58
URL: https://github.com/llvm/llvm-project/commit/0cdc1b6dd4a870fc41d4b15ad97e0001882aba58
DIFF: https://github.com/llvm/llvm-project/commit/0cdc1b6dd4a870fc41d4b15ad97e0001882aba58.diff
LOG: [SelectionDAG] Support integer types with multiple registers in ComputePHILiveOutRegInfo. (#172081)
PHIs that are larger than a legal integer type are split into multiple
virtual registers that are numbered sequentially. We can propagate the
known bits for each of these registers individually.
Big-endian targets are not supported yet because the register order would need
to be reversed.
Fixes #171671
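
For illustration only, here is a minimal standalone sketch of the per-register
bookkeeping described above. It is not part of the patch; it assumes an
ordinary LLVM development tree (linking against LLVMSupport), a hypothetical
target with 64-bit legal registers, and made-up constant incoming values. Each
incoming constant is widened to BitWidth * NumRegisters bits, each
register-sized slice is pulled out with APInt::extractBits, and the KnownBits
for corresponding slices of different incoming values are merged with
KnownBits::intersectWith, mirroring what ComputePHILiveOutRegInfo now does for
each RegIdx.

// Standalone illustration; not part of the patch. Assumes an LLVM build
// environment and links against LLVMSupport.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>

using namespace llvm;

int main() {
  // Hypothetical target: 64-bit legal registers, so an i128 PHI is split
  // into NumRegisters = 2 sequentially numbered virtual registers.
  const unsigned BitWidth = 64;
  const unsigned NumRegisters = 2;

  // Two hypothetical constant incoming values of the PHI, widened to the
  // total register width (128 bits), as the patch does with zext/sext.
  APInt A = APInt(32, 0x1234).zext(BitWidth * NumRegisters);
  APInt B = APInt(32, 0x1238).zext(BitWidth * NumRegisters);

  for (unsigned RegIdx = 0; RegIdx < NumRegisters; ++RegIdx) {
    // Slice out the bits that land in this virtual register.
    APInt SliceA = A.extractBits(BitWidth, BitWidth * RegIdx);
    APInt SliceB = B.extractBits(BitWidth, BitWidth * RegIdx);

    // Known bits common to both incoming values for this register, plus the
    // conservative sign-bit count, as in the per-RegIdx merge in the patch.
    KnownBits Known = KnownBits::makeConstant(SliceA).intersectWith(
        KnownBits::makeConstant(SliceB));
    unsigned SignBits =
        std::min(SliceA.getNumSignBits(), SliceB.getNumSignBits());

    outs() << "reg " << RegIdx << ": NumSignBits = " << SignBits
           << ", known-one mask = " << Known.One << "\n";
  }
  return 0;
}

With these inputs the low register keeps 51 sign bits and the common set bits
of the two constants, while the high register is known to be all zeros with 64
sign bits, which is exactly the information the old single-register path had
to discard for types wider than one register.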
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
llvm/test/CodeGen/AMDGPU/div_i128.ll
llvm/test/CodeGen/AMDGPU/div_v2i128.ll
llvm/test/CodeGen/AMDGPU/rem_i128.ll
llvm/test/CodeGen/ARM/cttz.ll
llvm/test/CodeGen/NVPTX/i128.ll
llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
llvm/test/CodeGen/RISCV/idiv_large.ll
llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
llvm/test/CodeGen/RISCV/rv32zbb.ll
llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
llvm/test/CodeGen/X86/bsf.ll
llvm/test/CodeGen/X86/bsr.ll
llvm/test/CodeGen/X86/ctlo.ll
llvm/test/CodeGen/X86/ctlz.ll
llvm/test/CodeGen/X86/cttz.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
llvm/test/CodeGen/X86/scheduler-backtracking.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index f4edbc74a3cc4..e73743ecbc9fa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -451,7 +451,9 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
"PHIs with non-vector integer types should have a single VT.");
EVT IntVT = ValueVTs[0];
- if (TLI->getNumRegisters(PN->getContext(), IntVT) != 1)
+ unsigned NumRegisters = TLI->getNumRegisters(PN->getContext(), IntVT);
+ // FIXME: Support multiple registers for big endian targets.
+ if (NumRegisters != 1 && MF->getDataLayout().isBigEndian())
return;
IntVT = TLI->getRegisterType(PN->getContext(), IntVT);
unsigned BitWidth = IntVT.getSizeInBits();
@@ -460,81 +462,95 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
if (It == ValueMap.end())
return;
- Register DestReg = It->second;
- if (DestReg == 0)
+ Register BaseReg = It->second;
+ if (!BaseReg)
return;
- assert(DestReg.isVirtual() && "Expected a virtual reg");
- LiveOutRegInfo.grow(DestReg);
- LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
-
- Value *V = PN->getIncomingValue(0);
- if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
- DestLOI.NumSignBits = 1;
- DestLOI.Known = KnownBits(BitWidth);
- return;
- }
-
- if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
- APInt Val;
- if (TLI->signExtendConstant(CI))
- Val = CI->getValue().sext(BitWidth);
- else
- Val = CI->getValue().zext(BitWidth);
- DestLOI.NumSignBits = Val.getNumSignBits();
- DestLOI.Known = KnownBits::makeConstant(Val);
- } else {
- assert(ValueMap.count(V) && "V should have been placed in ValueMap when its"
- "CopyToReg node was created.");
- Register SrcReg = ValueMap[V];
- if (!SrcReg.isVirtual()) {
- DestLOI.IsValid = false;
- return;
- }
- const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
- if (!SrcLOI) {
- DestLOI.IsValid = false;
- return;
- }
- DestLOI = *SrcLOI;
- }
+ assert(BaseReg.isVirtual() && "Expected a virtual reg");
- assert(DestLOI.Known.Zero.getBitWidth() == BitWidth &&
- DestLOI.Known.One.getBitWidth() == BitWidth &&
- "Masks should have the same bit width as the type.");
+ for (unsigned RegIdx = 0; RegIdx < NumRegisters; ++RegIdx) {
+ // Split registers are assigned sequentially.
+ Register DestReg = BaseReg.id() + RegIdx;
+ LiveOutRegInfo.grow(DestReg);
+ LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
- for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *V = PN->getIncomingValue(i);
+ Value *V = PN->getIncomingValue(0);
if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
DestLOI.NumSignBits = 1;
DestLOI.Known = KnownBits(BitWidth);
- return;
+ continue;
}
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
APInt Val;
if (TLI->signExtendConstant(CI))
- Val = CI->getValue().sext(BitWidth);
+ Val = CI->getValue().sext(BitWidth * NumRegisters);
else
- Val = CI->getValue().zext(BitWidth);
- DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits());
- DestLOI.Known = DestLOI.Known.intersectWith(KnownBits::makeConstant(Val));
- continue;
+ Val = CI->getValue().zext(BitWidth * NumRegisters);
+ APInt Extracted = Val.extractBits(BitWidth, BitWidth * RegIdx);
+ DestLOI.NumSignBits = Extracted.getNumSignBits();
+ DestLOI.Known = KnownBits::makeConstant(Extracted);
+ } else {
+ assert(ValueMap.count(V) &&
+ "V should have been placed in ValueMap when its"
+ "CopyToReg node was created.");
+ Register SrcReg = ValueMap[V];
+ if (!SrcReg.isVirtual()) {
+ DestLOI.IsValid = false;
+ continue;
+ }
+ // Split registers are assigned sequentially.
+ SrcReg = SrcReg.id() + RegIdx;
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ continue;
+ }
+ DestLOI = *SrcLOI;
}
- assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
- "its CopyToReg node was created.");
- Register SrcReg = ValueMap[V];
- if (!SrcReg.isVirtual()) {
- DestLOI.IsValid = false;
- return;
- }
- const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
- if (!SrcLOI) {
- DestLOI.IsValid = false;
- return;
+ assert(DestLOI.Known.Zero.getBitWidth() == BitWidth &&
+ DestLOI.Known.One.getBitWidth() == BitWidth &&
+ "Masks should have the same bit width as the type.");
+
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+ if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+ DestLOI.NumSignBits = 1;
+ DestLOI.Known = KnownBits(BitWidth);
+ break;
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ APInt Val;
+ if (TLI->signExtendConstant(CI))
+ Val = CI->getValue().sext(BitWidth * NumRegisters);
+ else
+ Val = CI->getValue().zext(BitWidth * NumRegisters);
+ APInt Extracted = Val.extractBits(BitWidth, BitWidth * RegIdx);
+ DestLOI.NumSignBits =
+ std::min(DestLOI.NumSignBits, Extracted.getNumSignBits());
+ DestLOI.Known =
+ DestLOI.Known.intersectWith(KnownBits::makeConstant(Extracted));
+ continue;
+ }
+
+ assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
+ "its CopyToReg node was created.");
+ Register SrcReg = ValueMap[V];
+ if (!SrcReg.isVirtual()) {
+ DestLOI.IsValid = false;
+ break;
+ }
+ // Split registers are assigned sequentially.
+ SrcReg = SrcReg.id() + RegIdx;
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ break;
+ }
+ DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
+ DestLOI.Known = DestLOI.Known.intersectWith(SrcLOI->Known);
}
- DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
- DestLOI.Known = DestLOI.Known.intersectWith(SrcLOI->Known);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index dd9a013d37203..d5b5ab6e457f9 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -14,7 +14,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: v_ashrrev_i32_e32 v16, 31, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc
@@ -24,13 +24,13 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v4, v0, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v5, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc
-; GFX9-NEXT: v_or_b32_e32 v3, v20, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v21, v0
+; GFX9-NEXT: v_or_b32_e32 v3, v21, v1
+; GFX9-NEXT: v_or_b32_e32 v2, v22, v0
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v3, v9, v11
; GFX9-NEXT: v_or_b32_e32 v2, v8, v10
@@ -39,9 +39,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_add_u32_e32 v2, 32, v2
; GFX9-NEXT: v_ffbh_u32_e32 v3, v1
; GFX9-NEXT: v_min_u32_e32 v2, v2, v3
-; GFX9-NEXT: v_ffbh_u32_e32 v3, v21
+; GFX9-NEXT: v_ffbh_u32_e32 v3, v22
; GFX9-NEXT: v_add_u32_e32 v3, 32, v3
-; GFX9-NEXT: v_ffbh_u32_e32 v4, v20
+; GFX9-NEXT: v_ffbh_u32_e32 v4, v21
; GFX9-NEXT: v_min_u32_e32 v3, v3, v4
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 64, v3
@@ -69,18 +69,18 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v18, v16
+; GFX9-NEXT: v_mov_b32_e32 v19, v17
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v19, v17
+; GFX9-NEXT: v_mov_b32_e32 v20, v18
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX9-NEXT: v_xor_b32_e32 v6, 0x7f, v2
-; GFX9-NEXT: v_or_b32_e32 v7, v3, v5
; GFX9-NEXT: v_or_b32_e32 v6, v6, v4
+; GFX9-NEXT: v_or_b32_e32 v7, v3, v5
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
@@ -92,14 +92,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_6
; GFX9-NEXT: ; %bb.1: ; %udiv-bb1
-; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v4, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v23, vcc, 1, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v4, vcc
; GFX9-NEXT: v_sub_u32_e32 v7, 0x7f, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v5, vcc
; GFX9-NEXT: v_sub_u32_e32 v12, 64, v7
-; GFX9-NEXT: v_or_b32_e32 v4, v23, v25
-; GFX9-NEXT: v_or_b32_e32 v3, v22, v24
+; GFX9-NEXT: v_or_b32_e32 v4, v24, v26
+; GFX9-NEXT: v_or_b32_e32 v3, v23, v25
; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, v[10:11]
; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[8:9]
; GFX9-NEXT: v_sub_u32_e32 v2, 63, v2
@@ -113,99 +113,96 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, v[8:9]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, v5, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_5
; GFX9-NEXT: ; %bb.2: ; %udiv-preheader
-; GFX9-NEXT: v_sub_u32_e32 v12, 64, v22
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[8:9]
+; GFX9-NEXT: v_sub_u32_e32 v12, 64, v23
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v23, v[8:9]
; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[10:11]
-; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22
-; GFX9-NEXT: v_or_b32_e32 v12, v6, v12
-; GFX9-NEXT: v_subrev_u32_e32 v6, 64, v22
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23
+; GFX9-NEXT: v_or_b32_e32 v14, v6, v12
+; GFX9-NEXT: v_subrev_u32_e32 v6, 64, v23
; GFX9-NEXT: v_or_b32_e32 v13, v7, v13
; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[10:11]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v12, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[6:7], v22, v[10:11]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v8, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v6, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, -1, v21
-; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v20, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v12, v7, v9, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v6, v14, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v23, v[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v9, v8, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v14, 0, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, -1, v22
+; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v21, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, -1, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-NEXT: v_mov_b32_e32 v14, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-NEXT: v_mov_b32_e32 v15, 0
; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: v_mov_b32_e32 v15, 0
+; GFX9-NEXT: v_mov_b32_e32 v16, 0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: .LBB0_3: ; %udiv-do-while
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_lshlrev_b64 v[30:31], 1, v[4:5]
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v5
-; GFX9-NEXT: v_or_b32_e32 v4, v14, v30
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
-; GFX9-NEXT: v_or_b32_e32 v5, v15, v31
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v3
-; GFX9-NEXT: v_or_b32_e32 v8, v8, v15
-; GFX9-NEXT: v_or_b32_e32 v10, v10, v14
-; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v26, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v27, v9, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v28, v10, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v29, v11, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v14
-; GFX9-NEXT: v_and_b32_e32 v14, v30, v21
+; GFX9-NEXT: v_or_b32_e32 v4, v15, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 31, v12
+; GFX9-NEXT: v_lshlrev_b64 v[11:12], 1, v[11:12]
+; GFX9-NEXT: v_or_b32_e32 v5, v16, v31
+; GFX9-NEXT: v_lshlrev_b64 v[13:14], 1, v[13:14]
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v3
+; GFX9-NEXT: v_or_b32_e32 v11, v11, v16
+; GFX9-NEXT: v_or_b32_e32 v13, v13, v15
+; GFX9-NEXT: v_sub_co_u32_e32 v15, vcc, v10, v11
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v27, v12, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v28, v13, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v29, v14, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v30, 31, v15
+; GFX9-NEXT: v_and_b32_e32 v15, v30, v22
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v14
-; GFX9-NEXT: v_and_b32_e32 v14, v30, v20
-; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v14, vcc
-; GFX9-NEXT: v_or3_b32 v2, v2, v6, v12
+; GFX9-NEXT: v_sub_co_u32_e32 v11, vcc, v11, v15
+; GFX9-NEXT: v_and_b32_e32 v15, v30, v21
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX9-NEXT: v_or3_b32 v2, v2, v6, v8
; GFX9-NEXT: v_and_b32_e32 v6, v30, v0
-; GFX9-NEXT: v_and_b32_e32 v14, v30, v1
-; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v6, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v11, v14, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v22
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
+; GFX9-NEXT: v_and_b32_e32 v15, v30, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v13, v6, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, v14, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v23, vcc, -1, v23
; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v24, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v25, vcc
-; GFX9-NEXT: v_or_b32_e32 v14, v22, v24
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v26, vcc
; GFX9-NEXT: v_or_b32_e32 v15, v23, v25
-; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT: v_or_b32_e32 v16, v24, v26
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[15:16]
; GFX9-NEXT: v_and_b32_e32 v6, 1, v30
-; GFX9-NEXT: v_mov_b32_e32 v15, v7
-; GFX9-NEXT: v_or3_b32 v3, v3, 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v16, v7
+; GFX9-NEXT: v_or3_b32 v3, v3, 0, v9
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-NEXT: v_mov_b32_e32 v15, v6
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB0_3
; GFX9-NEXT: ; %bb.4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB0_5: ; %Flow2
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[2:3]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[4:5]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5
-; GFX9-NEXT: v_or3_b32 v13, v3, 0, v13
-; GFX9-NEXT: v_or3_b32 v12, v2, v4, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 31, v5
+; GFX9-NEXT: v_or_b32_e32 v12, v12, v2
; GFX9-NEXT: v_or_b32_e32 v7, v7, v1
; GFX9-NEXT: v_or_b32_e32 v6, v6, v0
; GFX9-NEXT: .LBB0_6: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT: v_xor_b32_e32 v2, v17, v16
-; GFX9-NEXT: v_xor_b32_e32 v3, v19, v18
+; GFX9-NEXT: v_xor_b32_e32 v2, v18, v17
+; GFX9-NEXT: v_xor_b32_e32 v3, v20, v19
; GFX9-NEXT: v_xor_b32_e32 v0, v6, v2
; GFX9-NEXT: v_xor_b32_e32 v1, v7, v3
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
@@ -2268,21 +2265,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v8, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v8, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13]
-; GFX9-NEXT: v_or_b32_e32 v10, v13, v15
+; GFX9-NEXT: v_or_b32_e32 v11, v13, v15
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_xor_b32_e32 v9, 0x7f, v12
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT: v_or_b32_e32 v9, v9, v14
+; GFX9-NEXT: v_xor_b32_e32 v8, 0x7f, v12
+; GFX9-NEXT: v_or_b32_e32 v10, v8, v14
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5]
; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
@@ -2310,12 +2307,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15
; GFX9-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-NEXT: v_mov_b32_e32 v14, 0
; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_mov_b32_e32 v15, 0
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -2329,21 +2324,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v18
; GFX9-NEXT: v_or_b32_e32 v15, v13, v15
; GFX9-NEXT: v_lshrrev_b64 v[12:13], v12, v[2:3]
-; GFX9-NEXT: v_lshrrev_b64 v[2:3], v18, v[2:3]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v13, v1, s[4:5]
+; GFX9-NEXT: v_lshrrev_b64 v[1:2], v18, v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4
; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX9-NEXT: v_cndmask_b32_e64 v14, v12, v0, s[4:5]
; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc
-; GFX9-NEXT: v_mov_b32_e32 v14, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v16, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[4:5]
; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc
-; GFX9-NEXT: v_mov_b32_e32 v15, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: v_mov_b32_e32 v17, 0
; GFX9-NEXT: v_mov_b32_e32 v13, 0
@@ -2353,20 +2348,20 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v10, v16, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v15
+; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15]
; GFX9-NEXT: v_or_b32_e32 v2, v2, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v9
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v16
-; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v0
-; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v1, vcc
+; GFX9-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v22, v14
+; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v23, v15, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v24, v2, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v25, v3, vcc
; GFX9-NEXT: v_ashrrev_i32_e32 v26, 31, v16
; GFX9-NEXT: v_and_b32_e32 v16, v26, v4
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v16
+; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v16
; GFX9-NEXT: v_and_b32_e32 v16, v26, v5
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v16, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v16, vcc
; GFX9-NEXT: v_and_b32_e32 v16, v26, v6
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v16, vcc
; GFX9-NEXT: v_and_b32_e32 v16, v26, v7
@@ -2380,10 +2375,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_or_b32_e32 v16, v18, v20
; GFX9-NEXT: v_or_b32_e32 v17, v19, v21
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; GFX9-NEXT: v_or3_b32 v8, v8, v12, v14
+; GFX9-NEXT: v_or3_b32 v8, v8, v12, v0
; GFX9-NEXT: v_and_b32_e32 v12, 1, v26
; GFX9-NEXT: v_mov_b32_e32 v17, v13
-; GFX9-NEXT: v_or3_b32 v9, v9, 0, v15
+; GFX9-NEXT: v_or3_b32 v9, v9, 0, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v16, v12
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
@@ -2392,19 +2387,18 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB1_5: ; %Flow2
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[10:11]
-; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v11
-; GFX9-NEXT: v_or3_b32 v8, v3, 0, v15
-; GFX9-NEXT: v_or3_b32 v9, v2, v4, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 31, v11
+; GFX9-NEXT: v_or_b32_e32 v8, v8, v2
; GFX9-NEXT: v_or_b32_e32 v10, v13, v1
; GFX9-NEXT: v_or_b32_e32 v11, v12, v0
; GFX9-NEXT: .LBB1_6: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v0, v11
; GFX9-NEXT: v_mov_b32_e32 v1, v10
-; GFX9-NEXT: v_mov_b32_e32 v2, v9
-; GFX9-NEXT: v_mov_b32_e32 v3, v8
+; GFX9-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-NEXT: v_mov_b32_e32 v3, v9
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O0-LABEL: v_udiv_i128_vv:
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index ddac86b3719c2..1e96b63bcd321 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -7,309 +7,303 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v26, v24
; SDAG-NEXT: v_mov_b32_e32 v27, v25
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v2, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v1, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v0, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v1, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v0, v16, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v1, v20
-; SDAG-NEXT: v_ffbh_u32_e32 v2, v21
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v21, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v18
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v19
; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT: v_or_b32_e32 v0, v18, v16
; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1
+; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v1
; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
-; SDAG-NEXT: v_or_b32_e32 v1, v21, v17
+; SDAG-NEXT: v_or_b32_e32 v1, v19, v17
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT: v_min_u32_e32 v2, v19, v2
-; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22
+; SDAG-NEXT: v_min_u32_e32 v2, v21, v2
+; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v22
; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT: v_min_u32_e32 v1, v19, v22
-; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2
-; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7]
+; SDAG-NEXT: v_min_u32_e32 v1, v21, v22
+; SDAG-NEXT: v_add_i32_e64 v3, s[8:9], 64, v2
+; SDAG-NEXT: v_addc_u32_e64 v8, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v11, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v0, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc
-; SDAG-NEXT: v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7]
-; SDAG-NEXT: v_or_b32_e32 v2, v29, v0
-; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v0
-; SDAG-NEXT: v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT: v_min_u32_e32 v8, v8, v19
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v8, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v29
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v28
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v9, s[6:7]
+; SDAG-NEXT: v_or_b32_e32 v0, v29, v2
+; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v2
+; SDAG-NEXT: v_or_b32_e32 v1, v28, v3
+; SDAG-NEXT: v_min_u32_e32 v9, v9, v21
; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT: v_ffbh_u32_e32 v19, v1
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v2, v11, v19
-; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8
-; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v8, 0, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7]
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_min_u32_e32 v0, v11, v21
+; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 64, v9
+; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, s[6:7]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7]
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v2, v10
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v9, vcc
-; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v8
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v0, v8
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v10, vcc
+; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v8
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v20, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
-; SDAG-NEXT: v_or_b32_e32 v3, v9, v11
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v20, vcc
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v10
+; SDAG-NEXT: v_or_b32_e32 v1, v9, v11
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v19, s[4:5]
-; SDAG-NEXT: v_and_b32_e32 v2, 1, v2
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v20, v21, s[4:5]
+; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v17, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v16, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v19, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v18, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB0_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8
-; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v8
-; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v8
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[20:21], v18
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[18:19], v0
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
; SDAG-NEXT: v_or_b32_e32 v9, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v11, vcc, 0x7f, v8
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v8
; SDAG-NEXT: v_or_b32_e32 v10, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v11
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v11
-; SDAG-NEXT: v_lshl_b64 v[34:35], v[20:21], v11
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v34
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v34
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[18:19], v34
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; SDAG-NEXT: v_lshr_b64 v[8:9], v[20:21], v8
-; SDAG-NEXT: v_or_b32_e32 v9, v23, v9
-; SDAG-NEXT: v_or_b32_e32 v8, v22, v8
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v11
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v19, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v8, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v35, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v34, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v10, 0
-; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[18:19], v8
+; SDAG-NEXT: v_or_b32_e32 v9, v21, v9
+; SDAG-NEXT: v_or_b32_e32 v8, v20, v8
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[2:3], v[20:21], v30
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v30
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[16:17], v10
-; SDAG-NEXT: v_or_b32_e32 v11, v3, v11
-; SDAG-NEXT: v_or_b32_e32 v10, v2, v10
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[18:19], v30
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 64, v30
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
+; SDAG-NEXT: v_or_b32_e32 v21, v9, v21
+; SDAG-NEXT: v_or_b32_e32 v20, v8, v20
; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
-; SDAG-NEXT: v_subrev_i32_e64 v2, s[4:5], 64, v30
-; SDAG-NEXT: v_lshr_b64 v[2:3], v[16:17], v2
-; SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; SDAG-NEXT: v_subrev_i32_e64 v8, s[4:5], 64, v30
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v8
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v21, vcc
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v2, v20, s[4:5]
-; SDAG-NEXT: v_lshr_b64 v[2:3], v[16:17], v30
-; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v9, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v20, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v8, v18, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[8:9], v[16:17], v30
+; SDAG-NEXT: v_cndmask_b32_e32 v23, 0, v9, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v22, 0, v8, vcc
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc
-; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
-; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
-; SDAG-NEXT: v_mov_b32_e32 v10, 0
-; SDAG-NEXT: v_mov_b32_e32 v11, 0
-; SDAG-NEXT: s_mov_b64 s[4:5], 0
-; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v2, vcc
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v3, vcc
; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshrrev_b32_e32 v2, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v21
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v9
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; SDAG-NEXT: v_or_b32_e32 v19, v17, v19
-; SDAG-NEXT: v_or_b32_e32 v18, v16, v18
-; SDAG-NEXT: v_or_b32_e32 v16, v22, v38
-; SDAG-NEXT: v_or_b32_e32 v17, v20, v39
-; SDAG-NEXT: v_or_b32_e32 v8, v8, v2
-; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v34, v17
-; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v35, v21, vcc
-; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v36, v16, vcc
-; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v37, v23, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v2
-; SDAG-NEXT: v_and_b32_e32 v20, v2, v29
-; SDAG-NEXT: v_and_b32_e32 v22, v2, v28
-; SDAG-NEXT: v_and_b32_e32 v38, v2, v0
-; SDAG-NEXT: v_and_b32_e32 v39, v2, v1
-; SDAG-NEXT: v_and_b32_e32 v2, 1, v2
-; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v17, v20
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v22, vcc
-; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v16, v38, vcc
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_or_b32_e32 v11, v21, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v20, v10
+; SDAG-NEXT: v_or_b32_e32 v20, v22, v38
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v39
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v18
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v19, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v20, vcc
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v23, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; SDAG-NEXT: v_and_b32_e32 v21, v8, v29
+; SDAG-NEXT: v_and_b32_e32 v22, v8, v28
+; SDAG-NEXT: v_and_b32_e32 v38, v8, v2
+; SDAG-NEXT: v_and_b32_e32 v39, v8, v3
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v18, v21
+; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v19, v22, vcc
+; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v20, v38, vcc
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v39, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v30, v32
-; SDAG-NEXT: v_or_b32_e32 v17, v31, v33
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT: v_or_b32_e32 v20, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v21, v31, v33
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_or_b32_e32 v1, v17, v1
; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v8, v10, v8
-; SDAG-NEXT: v_mov_b32_e32 v17, v3
-; SDAG-NEXT: v_mov_b32_e32 v16, v2
+; SDAG-NEXT: v_or_b32_e32 v0, v16, v0
+; SDAG-NEXT: v_mov_b32_e32 v21, v9
+; SDAG-NEXT: v_mov_b32_e32 v20, v8
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB0_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; SDAG-NEXT: .LBB0_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[18:19], 1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1
; SDAG-NEXT: v_or_b32_e32 v0, v0, v16
-; SDAG-NEXT: v_or_b32_e32 v18, v11, v1
-; SDAG-NEXT: v_or_b32_e32 v19, v3, v9
-; SDAG-NEXT: v_or_b32_e32 v22, v10, v0
-; SDAG-NEXT: v_or_b32_e32 v23, v2, v8
+; SDAG-NEXT: v_or_b32_e32 v20, v9, v3
+; SDAG-NEXT: v_or_b32_e32 v21, v8, v2
; SDAG-NEXT: .LBB0_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v7
-; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
+; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v7
+; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v15
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, 0, v4
; SDAG-NEXT: v_mov_b32_e32 v11, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v20, v16
-; SDAG-NEXT: v_mov_b32_e32 v21, v17
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc
+; SDAG-NEXT: v_mov_b32_e32 v22, v18
+; SDAG-NEXT: v_mov_b32_e32 v23, v19
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v1, v2
-; SDAG-NEXT: v_ffbh_u32_e32 v4, v3
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v0, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12
-; SDAG-NEXT: v_or_b32_e32 v0, v2, v8
-; SDAG-NEXT: v_ffbh_u32_e32 v6, v8
-; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 32, v1
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v1, v3, v9
-; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], 32, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v3, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v5
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v2, s[4:5]
+; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12
+; SDAG-NEXT: v_or_b32_e32 v2, v4, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v8
+; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], 32, v3
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v13, vcc
+; SDAG-NEXT: v_or_b32_e32 v3, v5, v9
+; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v10
; SDAG-NEXT: v_ffbh_u32_e32 v30, v9
-; SDAG-NEXT: v_min_u32_e32 v4, v7, v4
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, 0, v14, vcc
+; SDAG-NEXT: v_min_u32_e32 v6, v16, v6
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v14, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT: v_min_u32_e32 v1, v6, v30
-; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4
-; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, 0, v15, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v7, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v7, v29
-; SDAG-NEXT: v_ffbh_u32_e32 v10, v28
+; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; SDAG-NEXT: v_min_u32_e32 v3, v10, v30
+; SDAG-NEXT: v_add_i32_e64 v6, s[8:9], 64, v6
+; SDAG-NEXT: v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v15, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v14, v16, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v12, v29
+; SDAG-NEXT: v_ffbh_u32_e32 v13, v28
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v6, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v4, v29, v0
-; SDAG-NEXT: v_ffbh_u32_e32 v6, v0
-; SDAG-NEXT: v_add_i32_e32 v7, vcc, 32, v7
-; SDAG-NEXT: v_or_b32_e32 v5, v28, v1
-; SDAG-NEXT: v_add_i32_e32 v6, vcc, 32, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v14, v1
-; SDAG-NEXT: v_min_u32_e32 v7, v7, v10
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT: v_min_u32_e32 v4, v6, v14
-; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v7
-; SDAG-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v7, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v6, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v15, v10, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v6, v29, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v2
+; SDAG-NEXT: v_add_i32_e32 v12, vcc, 32, v12
+; SDAG-NEXT: v_or_b32_e32 v7, v28, v3
+; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v15, v3
+; SDAG-NEXT: v_min_u32_e32 v12, v12, v13
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_min_u32_e32 v6, v10, v15
+; SDAG-NEXT: v_add_i32_e64 v7, s[4:5], 64, v12
+; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v6, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
-; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v4, v13
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v12, vcc
-; SDAG-NEXT: v_xor_b32_e32 v4, 0x7f, v6
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v16
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v14, vcc
+; SDAG-NEXT: v_xor_b32_e32 v12, 0x7f, v6
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v11, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT: v_or_b32_e32 v12, v12, v10
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v5, v7, v11
+; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v13, v7, v11
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT: v_and_b32_e32 v4, 1, v12
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4
+; SDAG-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT: v_and_b32_e32 v12, 1, v14
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v4, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6
; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc
-; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[4:5], v12
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
; SDAG-NEXT: v_or_b32_e32 v10, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0x7f, v6
; SDAG-NEXT: v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[6:7], v[8:9], v34
-; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34
+; SDAG-NEXT: v_lshl_b64 v[6:7], v[8:9], v16
+; SDAG-NEXT: v_sub_i32_e32 v17, vcc, 64, v16
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v16
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v17
; SDAG-NEXT: v_or_b32_e32 v7, v7, v11
; SDAG-NEXT: v_or_b32_e32 v6, v6, v10
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v16
; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v12, 0
@@ -318,108 +312,106 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB0_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[14:15], v[2:3], v30
-; SDAG-NEXT: v_sub_i32_e32 v4, vcc, 64, v30
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v30
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30
; SDAG-NEXT: v_lshr_b64 v[37:38], v[8:9], v30
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
-; SDAG-NEXT: v_mov_b32_e32 v13, 0
+; SDAG-NEXT: v_mov_b32_e32 v14, 0
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v5, 0
-; SDAG-NEXT: v_lshl_b64 v[48:49], v[8:9], v4
+; SDAG-NEXT: v_mov_b32_e32 v13, 0
+; SDAG-NEXT: v_lshl_b64 v[48:49], v[8:9], v12
; SDAG-NEXT: v_lshr_b64 v[8:9], v[8:9], v35
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc
-; SDAG-NEXT: v_or_b32_e32 v4, v15, v49
-; SDAG-NEXT: v_or_b32_e32 v14, v14, v48
-; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT: v_or_b32_e32 v12, v17, v49
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v48
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v2, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v15, 0, v38, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, v37, s[4:5]
-; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v12, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v38, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v37, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v3, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; SDAG-NEXT: v_cndmask_b32_e32 v9, v4, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc
-; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: v_mov_b32_e32 v5, 0
; SDAG-NEXT: .LBB0_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v9
; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v7
; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11
; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v4, v14, v4
+; SDAG-NEXT: v_or_b32_e32 v12, v16, v12
; SDAG-NEXT: v_or_b32_e32 v8, v8, v38
; SDAG-NEXT: v_or_b32_e32 v6, v6, v39
-; SDAG-NEXT: v_or_b32_e32 v7, v13, v7
-; SDAG-NEXT: v_or_b32_e32 v11, v3, v11
-; SDAG-NEXT: v_sub_i32_e32 v3, vcc, v34, v8
-; SDAG-NEXT: v_or_b32_e32 v6, v12, v6
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v35, v9, vcc
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v36, v4, vcc
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v37, v15, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v3
-; SDAG-NEXT: v_and_b32_e32 v3, v38, v29
-; SDAG-NEXT: v_and_b32_e32 v14, v38, v28
-; SDAG-NEXT: v_and_b32_e32 v39, v38, v0
-; SDAG-NEXT: v_and_b32_e32 v48, v38, v1
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v3
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc
-; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v4, v39, vcc
-; SDAG-NEXT: v_subb_u32_e32 v15, vcc, v15, v48, vcc
+; SDAG-NEXT: v_or_b32_e32 v7, v15, v7
+; SDAG-NEXT: v_or_b32_e32 v11, v5, v11
+; SDAG-NEXT: v_sub_i32_e32 v5, vcc, v34, v8
+; SDAG-NEXT: v_or_b32_e32 v6, v14, v6
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v35, v9, vcc
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v36, v12, vcc
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v37, v17, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; SDAG-NEXT: v_and_b32_e32 v16, v5, v29
+; SDAG-NEXT: v_and_b32_e32 v38, v5, v28
+; SDAG-NEXT: v_and_b32_e32 v39, v5, v2
+; SDAG-NEXT: v_and_b32_e32 v48, v5, v3
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v16
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v38, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v12, v39, vcc
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v17, v48, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT: v_or_b32_e32 v3, v30, v32
-; SDAG-NEXT: v_or_b32_e32 v4, v31, v33
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[3:4]
-; SDAG-NEXT: v_and_b32_e32 v4, 1, v38
+; SDAG-NEXT: v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT: v_and_b32_e32 v12, 1, v5
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v2, v10
-; SDAG-NEXT: v_mov_b32_e32 v2, v4
-; SDAG-NEXT: v_mov_b32_e32 v3, v5
+; SDAG-NEXT: v_or_b32_e32 v10, v4, v10
+; SDAG-NEXT: v_mov_b32_e32 v4, v12
+; SDAG-NEXT: v_mov_b32_e32 v5, v13
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB0_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB0_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[6:7], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11
; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v0, v0, v6
-; SDAG-NEXT: v_or_b32_e32 v13, v13, v1
-; SDAG-NEXT: v_or_b32_e32 v14, v5, v3
-; SDAG-NEXT: v_or_b32_e32 v5, v12, v0
-; SDAG-NEXT: v_or_b32_e32 v4, v4, v2
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v4
+; SDAG-NEXT: v_or_b32_e32 v13, v13, v3
+; SDAG-NEXT: v_or_b32_e32 v12, v12, v2
; SDAG-NEXT: .LBB0_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26
; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24
-; SDAG-NEXT: v_xor_b32_e32 v7, v21, v20
-; SDAG-NEXT: v_xor_b32_e32 v6, v17, v16
-; SDAG-NEXT: v_xor_b32_e32 v8, v18, v3
-; SDAG-NEXT: v_xor_b32_e32 v9, v22, v2
-; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3
-; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2
+; SDAG-NEXT: v_xor_b32_e32 v7, v23, v22
+; SDAG-NEXT: v_xor_b32_e32 v6, v19, v18
+; SDAG-NEXT: v_xor_b32_e32 v4, v1, v3
+; SDAG-NEXT: v_xor_b32_e32 v5, v0, v2
+; SDAG-NEXT: v_xor_b32_e32 v1, v20, v3
+; SDAG-NEXT: v_xor_b32_e32 v0, v21, v2
+; SDAG-NEXT: v_xor_b32_e32 v8, v15, v7
+; SDAG-NEXT: v_xor_b32_e32 v9, v14, v6
; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7
-; SDAG-NEXT: v_xor_b32_e32 v11, v5, v6
-; SDAG-NEXT: v_xor_b32_e32 v5, v14, v7
; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v9, v2, vcc
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v3, vcc
-; SDAG-NEXT: v_xor_b32_e32 v4, v4, v6
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; SDAG-NEXT: v_xor_b32_e32 v4, v12, v6
; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
-; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v11, v6, vcc
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v10, v7, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: v_sdiv_v2i128_vv:
@@ -834,189 +826,187 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-LABEL: v_udiv_v2i128_vv:
; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_or_b32_e32 v17, v9, v11
-; SDAG-NEXT: v_or_b32_e32 v16, v8, v10
-; SDAG-NEXT: v_or_b32_e32 v19, v1, v3
-; SDAG-NEXT: v_or_b32_e32 v18, v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v17, v3
+; SDAG-NEXT: v_mov_b32_e32 v16, v2
+; SDAG-NEXT: v_or_b32_e32 v3, v9, v11
+; SDAG-NEXT: v_or_b32_e32 v2, v8, v10
+; SDAG-NEXT: v_or_b32_e32 v19, v1, v17
+; SDAG-NEXT: v_or_b32_e32 v18, v0, v16
; SDAG-NEXT: v_ffbh_u32_e32 v20, v10
; SDAG-NEXT: v_ffbh_u32_e32 v21, v11
; SDAG-NEXT: v_ffbh_u32_e32 v22, v8
; SDAG-NEXT: v_ffbh_u32_e32 v23, v9
-; SDAG-NEXT: v_ffbh_u32_e32 v24, v2
-; SDAG-NEXT: v_ffbh_u32_e32 v25, v3
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v16
+; SDAG-NEXT: v_ffbh_u32_e32 v25, v17
; SDAG-NEXT: v_ffbh_u32_e32 v26, v0
; SDAG-NEXT: v_ffbh_u32_e32 v27, v1
; SDAG-NEXT: v_mov_b32_e32 v28, 0
; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
-; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
-; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
+; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v22
; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
-; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
-; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
+; SDAG-NEXT: v_min_u32_e32 v2, v2, v21
+; SDAG-NEXT: v_min_u32_e32 v3, v3, v23
; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
-; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3
; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v20, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v21, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
-; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v16, v18
-; SDAG-NEXT: v_subb_u32_e32 v22, vcc, v20, v17, vcc
-; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v21
-; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v28, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[21:22]
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v2, v18
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v22, v3, vcc
+; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v20
+; SDAG-NEXT: v_subb_u32_e32 v22, vcc, 0, v28, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[20:21]
; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v23
-; SDAG-NEXT: v_or_b32_e32 v17, v22, v24
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[23:24]
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v22
+; SDAG-NEXT: v_or_b32_e32 v3, v21, v23
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23]
; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[23:24]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5]
-; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[22:23]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v19, v18, s[4:5]
+; SDAG-NEXT: v_and_b32_e32 v2, 1, v2
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v17, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v16, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB1_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v21
-; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v21
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v20
+; SDAG-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v20
+; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v21, vcc
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
+; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v22, vcc
+; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v23, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v26, v28
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v20
+; SDAG-NEXT: v_or_b32_e32 v19, v27, v29
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v24
+; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v24
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v25
+; SDAG-NEXT: v_or_b32_e32 v19, v21, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v20, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v22, vcc
-; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
-; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v23, vcc
-; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v24, vcc
-; SDAG-NEXT: v_or_b32_e32 v22, v18, v28
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v21
-; SDAG-NEXT: v_or_b32_e32 v23, v27, v29
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v26
-; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, v26
-; SDAG-NEXT: v_lshl_b64 v[30:31], v[0:1], v26
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23]
-; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v21
-; SDAG-NEXT: v_or_b32_e32 v22, v25, v22
-; SDAG-NEXT: v_or_b32_e32 v21, v24, v21
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v17, v22, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v16, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v31, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v30, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, v21, v2, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v23, 0
-; SDAG-NEXT: v_mov_b32_e32 v24, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB1_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v18
-; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v18
-; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v23
-; SDAG-NEXT: v_or_b32_e32 v24, v20, v24
-; SDAG-NEXT: v_or_b32_e32 v23, v19, v23
-; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
-; SDAG-NEXT: v_subrev_i32_e64 v19, s[4:5], 64, v18
-; SDAG-NEXT: v_lshr_b64 v[19:20], v[2:3], v19
-; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v24, vcc
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v20, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e32 v19, v19, v23, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v19, v0, s[4:5]
-; SDAG-NEXT: v_lshr_b64 v[2:3], v[2:3], v18
-; SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v26
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v22
+; SDAG-NEXT: v_or_b32_e32 v23, v21, v23
+; SDAG-NEXT: v_or_b32_e32 v22, v20, v22
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; SDAG-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v26
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[16:17], v20
+; SDAG-NEXT: v_cndmask_b32_e32 v21, v21, v23, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v21, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v20, v22, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v20, v0, s[4:5]
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[16:17], v26
+; SDAG-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
-; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: s_mov_b64 s[4:5], 0
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: v_mov_b32_e32 v25, 0
-; SDAG-NEXT: v_mov_b32_e32 v26, 0
; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v17
+; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; SDAG-NEXT: v_or_b32_e32 v17, v26, v17
-; SDAG-NEXT: v_or_b32_e32 v16, v25, v16
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v1
; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v19
-; SDAG-NEXT: v_lshrrev_b32_e32 v19, 31, v22
-; SDAG-NEXT: v_or_b32_e32 v0, v0, v19
-; SDAG-NEXT: v_sub_i32_e32 v19, vcc, v30, v0
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v31, v1, vcc
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v32, v2, vcc
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v33, v3, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v19
-; SDAG-NEXT: v_and_b32_e32 v25, v19, v8
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25
-; SDAG-NEXT: v_and_b32_e32 v25, v19, v9
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v25, vcc
-; SDAG-NEXT: v_and_b32_e32 v25, v19, v10
-; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v25, vcc
-; SDAG-NEXT: v_and_b32_e32 v25, v19, v11
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v25, vcc
-; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v3
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v20
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v30, v0
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v31, v1, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v32, v16, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v33, v17, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v20, 31, v20
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v8
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v24
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v9
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v24, vcc
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v10
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v16, v24, vcc
+; SDAG-NEXT: v_and_b32_e32 v24, v20, v11
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v17, v24, vcc
+; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v26
; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc
; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc
; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
-; SDAG-NEXT: v_or_b32_e32 v25, v18, v28
-; SDAG-NEXT: v_or_b32_e32 v26, v27, v29
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26]
-; SDAG-NEXT: v_and_b32_e32 v19, 1, v19
-; SDAG-NEXT: v_lshl_b64 v[21:22], v[21:22], 1
-; SDAG-NEXT: v_or_b32_e32 v21, v21, v34
-; SDAG-NEXT: v_or_b32_e32 v22, v24, v22
+; SDAG-NEXT: v_or_b32_e32 v24, v26, v28
+; SDAG-NEXT: v_or_b32_e32 v25, v27, v29
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; SDAG-NEXT: v_and_b32_e32 v20, 1, v20
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v34
+; SDAG-NEXT: v_or_b32_e32 v3, v23, v3
; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
-; SDAG-NEXT: v_mov_b32_e32 v26, v20
-; SDAG-NEXT: v_mov_b32_e32 v25, v19
+; SDAG-NEXT: v_or_b32_e32 v2, v22, v2
+; SDAG-NEXT: v_mov_b32_e32 v25, v21
+; SDAG-NEXT: v_mov_b32_e32 v24, v20
; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SDAG-NEXT: s_cbranch_execnz .LBB1_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; SDAG-NEXT: .LBB1_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[0:1], v[21:22], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v17
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[16:17], 1
-; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
-; SDAG-NEXT: v_or_b32_e32 v16, v24, v1
-; SDAG-NEXT: v_or_b32_e32 v18, v20, v3
-; SDAG-NEXT: v_or_b32_e32 v17, v23, v0
-; SDAG-NEXT: v_or_b32_e32 v19, v19, v2
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[18:19], 1
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v8
+; SDAG-NEXT: v_or_b32_e32 v18, v21, v1
+; SDAG-NEXT: v_or_b32_e32 v19, v20, v0
; SDAG-NEXT: .LBB1_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v1, v13, v15
; SDAG-NEXT: v_or_b32_e32 v0, v12, v14
-; SDAG-NEXT: v_or_b32_e32 v3, v5, v7
-; SDAG-NEXT: v_or_b32_e32 v2, v4, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v8, v14
-; SDAG-NEXT: v_ffbh_u32_e32 v9, v15
-; SDAG-NEXT: v_ffbh_u32_e32 v10, v12
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v13
+; SDAG-NEXT: v_or_b32_e32 v9, v5, v7
+; SDAG-NEXT: v_or_b32_e32 v8, v4, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v14
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v15
+; SDAG-NEXT: v_ffbh_u32_e32 v16, v12
+; SDAG-NEXT: v_ffbh_u32_e32 v17, v13
; SDAG-NEXT: v_ffbh_u32_e32 v20, v6
; SDAG-NEXT: v_ffbh_u32_e32 v21, v7
; SDAG-NEXT: v_ffbh_u32_e32 v22, v4
@@ -1024,139 +1014,137 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8
-; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v10
-; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v20
-; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v22
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v10
+; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v16
+; SDAG-NEXT: v_add_i32_e64 v8, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 32, v22
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
-; SDAG-NEXT: v_min_u32_e32 v0, v0, v9
-; SDAG-NEXT: v_min_u32_e32 v1, v1, v11
-; SDAG-NEXT: v_min_u32_e32 v2, v2, v21
-; SDAG-NEXT: v_min_u32_e32 v3, v3, v23
+; SDAG-NEXT: v_min_u32_e32 v0, v0, v11
+; SDAG-NEXT: v_min_u32_e32 v1, v1, v17
+; SDAG-NEXT: v_min_u32_e32 v8, v8, v21
+; SDAG-NEXT: v_min_u32_e32 v9, v9, v23
; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v1
-; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, vcc
-; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3
-; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_add_i32_e32 v9, vcc, 64, v9
+; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v1, vcc
-; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v2
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc
-; SDAG-NEXT: v_or_b32_e32 v0, v0, v20
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v1, v3, v21
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v8
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v10, v1, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v24, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v24, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v16
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v9, v1, v17
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v10
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB1_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v2
-; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v2
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
-; SDAG-NEXT: v_mov_b32_e32 v1, 0
-; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v3, vcc
+; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0
+; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v0
+; SDAG-NEXT: v_addc_u32_e32 v23, vcc, 0, v1, vcc
; SDAG-NEXT: v_lshl_b64 v[8:9], v[4:5], v8
-; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v20, vcc
-; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v21, vcc
+; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v16, vcc
+; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v17, vcc
; SDAG-NEXT: v_or_b32_e32 v10, v22, v24
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v2
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 0x7f, v0
; SDAG-NEXT: v_or_b32_e32 v11, v23, v25
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], v26
-; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v26
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], v20
+; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 64, v20
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[4:5], v20
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v27
-; SDAG-NEXT: v_or_b32_e32 v3, v3, v11
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v10
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, 0, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, v20, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v21
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v11
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v10
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v20
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v16, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB1_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[10:11], v[4:5], v22
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v22
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v22
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v22
; SDAG-NEXT: v_subrev_i32_e32 v27, vcc, 64, v22
; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v22
; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v1, 0
-; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v16
; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v27
; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v0, v11, v32
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v31
+; SDAG-NEXT: v_or_b32_e32 v16, v21, v32
+; SDAG-NEXT: v_or_b32_e32 v20, v20, v31
; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v7, v0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v30, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v29, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v30, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v29, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v22
-; SDAG-NEXT: v_cndmask_b32_e32 v7, v0, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: v_mov_b32_e32 v5, 0
; SDAG-NEXT: .LBB1_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v7
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v7
; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v9
-; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v0
-; SDAG-NEXT: v_or_b32_e32 v0, v6, v30
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v31
-; SDAG-NEXT: v_or_b32_e32 v3, v21, v3
-; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v26, v0
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v27, v7, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v28, v10, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v29, v11, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v6
-; SDAG-NEXT: v_and_b32_e32 v31, v30, v13
-; SDAG-NEXT: v_and_b32_e32 v6, v30, v12
-; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v0, v6
-; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v31, vcc
-; SDAG-NEXT: v_or_b32_e32 v9, v5, v9
-; SDAG-NEXT: v_or_b32_e32 v2, v20, v2
-; SDAG-NEXT: v_and_b32_e32 v0, 1, v30
+; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_or_b32_e32 v20, v20, v16
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v30
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v31
+; SDAG-NEXT: v_or_b32_e32 v1, v9, v1
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v26, v6
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v27, v7, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v28, v20, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v29, v21, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v30, 31, v16
+; SDAG-NEXT: v_and_b32_e32 v16, v30, v13
+; SDAG-NEXT: v_and_b32_e32 v31, v30, v12
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v31
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v16, vcc
+; SDAG-NEXT: v_or_b32_e32 v11, v5, v11
+; SDAG-NEXT: v_or_b32_e32 v0, v8, v0
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v30
; SDAG-NEXT: v_and_b32_e32 v5, v30, v15
; SDAG-NEXT: v_and_b32_e32 v30, v30, v14
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v30, vcc
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v5, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v20, v30, vcc
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v21, v5, vcc
; SDAG-NEXT: v_add_i32_e32 v22, vcc, -1, v22
; SDAG-NEXT: v_addc_u32_e32 v23, vcc, -1, v23, vcc
; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc
@@ -1165,33 +1153,29 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v30, v22, v24
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31]
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v8, v4, v8
-; SDAG-NEXT: v_mov_b32_e32 v5, v1
-; SDAG-NEXT: v_mov_b32_e32 v4, v0
+; SDAG-NEXT: v_or_b32_e32 v10, v4, v10
+; SDAG-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-NEXT: v_mov_b32_e32 v5, v17
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB1_9
; SDAG-NEXT: ; %bb.10: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB1_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v6, 31, v9
-; SDAG-NEXT: v_lshl_b64 v[4:5], v[8:9], 1
-; SDAG-NEXT: v_or_b32_e32 v2, v2, v6
-; SDAG-NEXT: v_or_b32_e32 v8, v21, v3
-; SDAG-NEXT: v_or_b32_e32 v10, v1, v5
-; SDAG-NEXT: v_or_b32_e32 v9, v20, v2
-; SDAG-NEXT: v_or_b32_e32 v11, v0, v4
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[10:11], 1
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v4
+; SDAG-NEXT: v_or_b32_e32 v10, v17, v1
+; SDAG-NEXT: v_or_b32_e32 v11, v16, v0
; SDAG-NEXT: .LBB1_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: v_mov_b32_e32 v0, v19
; SDAG-NEXT: v_mov_b32_e32 v1, v18
-; SDAG-NEXT: v_mov_b32_e32 v2, v17
-; SDAG-NEXT: v_mov_b32_e32 v3, v16
; SDAG-NEXT: v_mov_b32_e32 v4, v11
; SDAG-NEXT: v_mov_b32_e32 v5, v10
-; SDAG-NEXT: v_mov_b32_e32 v6, v9
-; SDAG-NEXT: v_mov_b32_e32 v7, v8
+; SDAG-NEXT: v_mov_b32_e32 v6, v8
+; SDAG-NEXT: v_mov_b32_e32 v7, v9
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: v_udiv_v2i128_vv:
@@ -1633,97 +1617,95 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v1, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v0, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v33, v17, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v34, v16, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB2_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10
-; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
-; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_sub_i32_e64 v8, s[4:5], 63, v10
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v20
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[16:17], v8
; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v10
; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24
-; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[16:17], v24
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v22
+; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v22
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[16:17], v22
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v25
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[16:17], v23
; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, v20, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v10, 0
+; SDAG-NEXT: v_mov_b32_e32 v11, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
; SDAG-NEXT: v_lshr_b64 v[22:23], v[16:17], v32
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 64, v32
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 64, v32
; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32
; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32
; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v9, 0
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v8
+; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v10
; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc
-; SDAG-NEXT: v_or_b32_e32 v8, v23, v27
+; SDAG-NEXT: v_or_b32_e32 v10, v23, v27
; SDAG-NEXT: v_or_b32_e32 v22, v22, v26
; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v2, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v49, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v49, v10, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v22, v48, v22, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v3, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
-; SDAG-NEXT: v_cndmask_b32_e32 v25, v8, v17, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v25, v10, v17, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v16, vcc
; SDAG-NEXT: v_mov_b32_e32 v22, 0
; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v25
; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
-; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
+; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_or_b32_e32 v19, v23, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v22, v18
; SDAG-NEXT: v_or_b32_e32 v22, v26, v48
; SDAG-NEXT: v_or_b32_e32 v23, v24, v49
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v8
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v36, v23
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v25, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v38, v22, vcc
-; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v39, v27, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; SDAG-NEXT: v_and_b32_e32 v24, v8, v31
-; SDAG-NEXT: v_and_b32_e32 v26, v8, v30
-; SDAG-NEXT: v_and_b32_e32 v48, v8, v2
-; SDAG-NEXT: v_and_b32_e32 v49, v8, v3
-; SDAG-NEXT: v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v10
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v36, v23
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v25, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v38, v22, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v39, v27, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v10, 31, v10
+; SDAG-NEXT: v_and_b32_e32 v24, v10, v31
+; SDAG-NEXT: v_and_b32_e32 v26, v10, v30
+; SDAG-NEXT: v_and_b32_e32 v48, v10, v2
+; SDAG-NEXT: v_and_b32_e32 v49, v10, v3
+; SDAG-NEXT: v_and_b32_e32 v10, 1, v10
; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v23, v24
; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v26, vcc
; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v22, v48, vcc
@@ -1735,132 +1717,128 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v22, v32, v34
; SDAG-NEXT: v_or_b32_e32 v23, v33, v35
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
-; SDAG-NEXT: v_or_b32_e32 v11, v19, v11
+; SDAG-NEXT: v_or_b32_e32 v9, v21, v9
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v18, v10
-; SDAG-NEXT: v_mov_b32_e32 v23, v9
-; SDAG-NEXT: v_mov_b32_e32 v22, v8
+; SDAG-NEXT: v_or_b32_e32 v8, v20, v8
+; SDAG-NEXT: v_mov_b32_e32 v23, v11
+; SDAG-NEXT: v_mov_b32_e32 v22, v10
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB2_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB2_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v22
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v34, v19, v11
-; SDAG-NEXT: v_or_b32_e32 v32, v18, v10
-; SDAG-NEXT: v_or_b32_e32 v27, v9, v21
-; SDAG-NEXT: v_or_b32_e32 v33, v8, v20
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v20
+; SDAG-NEXT: v_or_b32_e32 v33, v11, v19
+; SDAG-NEXT: v_or_b32_e32 v34, v10, v18
; SDAG-NEXT: .LBB2_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_ashrrev_i32_e32 v32, 31, v7
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT: v_mov_b32_e32 v35, v26
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
-; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc
+; SDAG-NEXT: v_mov_b32_e32 v35, v32
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v6, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v4, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v10, v8
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v9
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v18, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v11
; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5]
-; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0, v12
-; SDAG-NEXT: v_or_b32_e32 v6, v8, v4
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v4
-; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v10
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v7, v9, v5
-; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v20
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v5
-; SDAG-NEXT: v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v14, vcc
+; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v12
+; SDAG-NEXT: v_or_b32_e32 v6, v10, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v4
+; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v13, vcc
+; SDAG-NEXT: v_or_b32_e32 v7, v11, v5
+; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v5
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v14, vcc
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v21, s[4:5]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
-; SDAG-NEXT: v_min_u32_e32 v7, v20, v22
-; SDAG-NEXT: v_add_i32_e64 v10, s[8:9], 64, v10
-; SDAG-NEXT: v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v15, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v11, s[4:5]
-; SDAG-NEXT: v_ffbh_u32_e32 v11, v37
-; SDAG-NEXT: v_ffbh_u32_e32 v14, v36
+; SDAG-NEXT: v_min_u32_e32 v7, v22, v24
+; SDAG-NEXT: v_add_i32_e64 v12, s[8:9], 64, v18
+; SDAG-NEXT: v_addc_u32_e64 v13, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v15, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v20, s[4:5]
+; SDAG-NEXT: v_ffbh_u32_e32 v14, v37
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v36
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v19, v10, v7, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v13, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v10, v37, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v13, v6
-; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v11
-; SDAG-NEXT: v_or_b32_e32 v11, v36, v7
-; SDAG-NEXT: v_add_i32_e32 v13, vcc, 32, v13
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v7
-; SDAG-NEXT: v_min_u32_e32 v14, v15, v14
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT: v_min_u32_e32 v10, v13, v20
-; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v14
-; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v22, v12, v7, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v18, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v12, v37, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v15, v6
+; SDAG-NEXT: v_add_i32_e32 v14, vcc, 32, v14
+; SDAG-NEXT: v_or_b32_e32 v13, v36, v7
+; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v7
+; SDAG-NEXT: v_min_u32_e32 v14, v14, v20
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT: v_min_u32_e32 v12, v15, v18
+; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], 64, v14
+; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
-; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc
-; SDAG-NEXT: v_xor_b32_e32 v12, 0x7f, v10
-; SDAG-NEXT: v_subb_u32_e32 v14, vcc, 0, v18, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v15, vcc, 0, v18, vcc
-; SDAG-NEXT: v_or_b32_e32 v12, v12, v14
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v12, v22
+; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v14, v21, vcc
+; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v12
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v18
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v15, v13, v19
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v13, v11, v15
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT: v_and_b32_e32 v12, 1, v18
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12
+; SDAG-NEXT: v_and_b32_e32 v14, 1, v20
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v15, v5, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v13, v9, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v8, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v4, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v11, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v10, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v10
-; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v10
-; SDAG-NEXT: v_mov_b32_e32 v12, 0
-; SDAG-NEXT: v_mov_b32_e32 v13, 0
-; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v11, vcc
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[8:9], v18
-; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc
-; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc
-; SDAG-NEXT: v_or_b32_e32 v14, v38, v48
-; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 0x7f, v10
-; SDAG-NEXT: v_or_b32_e32 v15, v39, v49
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[4:5], v22
-; SDAG-NEXT: v_sub_i32_e32 v23, vcc, 64, v22
-; SDAG-NEXT: v_lshl_b64 v[20:21], v[8:9], v22
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_lshr_b64 v[14:15], v[8:9], v23
-; SDAG-NEXT: v_or_b32_e32 v11, v11, v15
-; SDAG-NEXT: v_or_b32_e32 v10, v10, v14
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v22
-; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v11, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v10, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v21, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v20, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22
+; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12
+; SDAG-NEXT: v_sub_i32_e64 v14, s[4:5], 63, v12
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v13, vcc
+; SDAG-NEXT: v_lshl_b64 v[13:14], v[10:11], v14
+; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v18, vcc
+; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v19, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v38, v48
+; SDAG-NEXT: v_sub_i32_e32 v15, vcc, 0x7f, v12
+; SDAG-NEXT: v_or_b32_e32 v19, v39, v49
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v15
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v15
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[10:11], v15
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[10:11], v12
+; SDAG-NEXT: v_or_b32_e32 v12, v21, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v20, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v14, v12, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v13, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v18, 0
@@ -1869,61 +1847,61 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB2_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[20:21], v[8:9], v38
-; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v38
+; SDAG-NEXT: v_lshr_b64 v[22:23], v[10:11], v38
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v38
; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38
-; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38
+; SDAG-NEXT: v_lshr_b64 v[24:25], v[4:5], v38
; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37
-; SDAG-NEXT: v_mov_b32_e32 v18, 0
-; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v13, 0
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v12
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v18
; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51
; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc
-; SDAG-NEXT: v_or_b32_e32 v12, v21, v25
-; SDAG-NEXT: v_or_b32_e32 v20, v20, v24
+; SDAG-NEXT: v_or_b32_e32 v18, v23, v27
+; SDAG-NEXT: v_or_b32_e32 v22, v22, v26
; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v6, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38
-; SDAG-NEXT: v_cndmask_b32_e64 v12, v54, v12, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v53, v20, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v54, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v53, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v7, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38
-; SDAG-NEXT: v_cndmask_b32_e32 v23, v12, v9, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v22, v20, v8, vcc
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v25, v18, v11, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v24, v22, v10, vcc
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: .LBB2_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v25
; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v12, 31, v23
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15
; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT: v_or_b32_e32 v24, v24, v12
-; SDAG-NEXT: v_or_b32_e32 v22, v22, v54
-; SDAG-NEXT: v_or_b32_e32 v12, v14, v55
-; SDAG-NEXT: v_or_b32_e32 v15, v19, v15
-; SDAG-NEXT: v_or_b32_e32 v11, v21, v11
-; SDAG-NEXT: v_or_b32_e32 v14, v18, v12
-; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v50, v22
-; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v51, v23, vcc
-; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v52, v24, vcc
-; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v53, v25, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v12
-; SDAG-NEXT: v_and_b32_e32 v12, 1, v21
-; SDAG-NEXT: v_and_b32_e32 v54, v21, v7
-; SDAG-NEXT: v_and_b32_e32 v55, v21, v6
-; SDAG-NEXT: v_and_b32_e32 v40, v21, v36
-; SDAG-NEXT: v_and_b32_e32 v21, v21, v37
-; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v22, v21
-; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v40, vcc
-; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v24, v55, vcc
-; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v54, vcc
+; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_or_b32_e32 v26, v26, v18
+; SDAG-NEXT: v_or_b32_e32 v24, v24, v54
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v55
+; SDAG-NEXT: v_or_b32_e32 v15, v21, v15
+; SDAG-NEXT: v_or_b32_e32 v13, v23, v13
+; SDAG-NEXT: v_or_b32_e32 v14, v20, v14
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v50, v24
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v51, v25, vcc
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v52, v26, vcc
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v53, v27, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v23, 31, v18
+; SDAG-NEXT: v_and_b32_e32 v18, 1, v23
+; SDAG-NEXT: v_and_b32_e32 v54, v23, v7
+; SDAG-NEXT: v_and_b32_e32 v55, v23, v6
+; SDAG-NEXT: v_and_b32_e32 v40, v23, v36
+; SDAG-NEXT: v_and_b32_e32 v23, v23, v37
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v23
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v40, vcc
+; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v55, vcc
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v54, vcc
; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v38
; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc
; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v48, vcc
@@ -1932,9 +1910,9 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v54, v38, v48
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55]
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v10, v20, v10
-; SDAG-NEXT: v_mov_b32_e32 v21, v13
-; SDAG-NEXT: v_mov_b32_e32 v20, v12
+; SDAG-NEXT: v_or_b32_e32 v12, v22, v12
+; SDAG-NEXT: v_mov_b32_e32 v23, v19
+; SDAG-NEXT: v_mov_b32_e32 v22, v18
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB2_9
; SDAG-NEXT: ; %bb.10: ; %Flow
@@ -1942,77 +1920,75 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: .LBB2_11: ; %Flow11
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v11
-; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
; SDAG-NEXT: v_or_b32_e32 v14, v14, v20
-; SDAG-NEXT: v_or_b32_e32 v19, v19, v15
-; SDAG-NEXT: v_or_b32_e32 v13, v13, v11
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v14
-; SDAG-NEXT: v_or_b32_e32 v12, v12, v10
+; SDAG-NEXT: v_or_b32_e32 v20, v19, v13
+; SDAG-NEXT: v_or_b32_e32 v21, v18, v12
; SDAG-NEXT: .LBB2_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v14, v33, v3
-; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
-; SDAG-NEXT: v_mul_lo_u32 v15, v27, v2
-; SDAG-NEXT: v_mul_lo_u32 v23, v34, v31
-; SDAG-NEXT: v_mul_lo_u32 v24, v32, v30
-; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v31, v33, 0
-; SDAG-NEXT: v_mov_b32_e32 v22, 0
-; SDAG-NEXT: v_mul_lo_u32 v25, v12, v7
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0
-; SDAG-NEXT: v_mul_lo_u32 v34, v13, v6
-; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37
-; SDAG-NEXT: v_mul_lo_u32 v38, v18, v36
-; SDAG-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v33, v[21:22]
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v20
-; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v25
-; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15
-; SDAG-NEXT: v_mov_b32_e32 v21, v6
-; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v31, v27, v[21:22]
-; SDAG-NEXT: v_xor_b32_e32 v16, v16, v28
-; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v34
-; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11]
-; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v7, v15
+; SDAG-NEXT: v_mul_lo_u32 v18, v34, v3
+; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v34, v2, 0
+; SDAG-NEXT: v_mul_lo_u32 v19, v33, v2
+; SDAG-NEXT: v_mul_lo_u32 v25, v9, v31
+; SDAG-NEXT: v_mul_lo_u32 v26, v8, v30
+; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v31, v34, 0
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
+; SDAG-NEXT: v_mul_lo_u32 v9, v21, v7
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v21, v6, 0
+; SDAG-NEXT: v_mul_lo_u32 v27, v20, v6
+; SDAG-NEXT: v_mul_lo_u32 v38, v15, v37
+; SDAG-NEXT: v_mul_lo_u32 v39, v14, v36
+; SDAG-NEXT: v_add_i32_e32 v13, vcc, v13, v18
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v34, v[23:24]
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v22
+; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v9
+; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19
+; SDAG-NEXT: v_mov_b32_e32 v23, v6
+; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v33, v[23:24]
+; SDAG-NEXT: v_xor_b32_e32 v18, v18, v28
+; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v3, v27
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v8, v31, v[12:13]
+; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v7, v16
; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v14, vcc, v17, v14, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v37, v12, 0
-; SDAG-NEXT: v_add_i32_e64 v15, s[4:5], v23, v11
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v27, v[6:7]
-; SDAG-NEXT: v_xor_b32_e32 v17, v14, v29
-; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3
-; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v36, v12, v[21:22]
-; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v24, v15
+; SDAG-NEXT: v_subb_u32_e32 v12, vcc, v17, v15, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v37, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v37, v21, 0
+; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], v25, v9
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v30, v33, v[6:7]
+; SDAG-NEXT: v_xor_b32_e32 v16, v12, v29
; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v38, v3
-; SDAG-NEXT: v_mov_b32_e32 v21, v11
-; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v13, v[21:22]
-; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10
-; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], v7, v18, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v36, v21, v[23:24]
+; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9
+; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v39, v3
+; SDAG-NEXT: v_mov_b32_e32 v23, v12
+; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v37, v20, v[23:24]
+; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8
+; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], v7, v9, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc
-; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v12, v15
+; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v13, v15
; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v10, vcc
-; SDAG-NEXT: v_xor_b32_e32 v10, v0, v28
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v36, v13, v[6:7]
-; SDAG-NEXT: v_xor_b32_e32 v11, v1, v29
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v16, v28
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, v0, v28
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v36, v20, v[6:7]
+; SDAG-NEXT: v_xor_b32_e32 v9, v1, v29
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v18, v28
; SDAG-NEXT: v_add_i32_e64 v6, s[4:5], v6, v2
; SDAG-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v3, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v17, v29, vcc
-; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v10, v28, vcc
-; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v11, v29, vcc
-; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v20
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v14, vcc
-; SDAG-NEXT: v_xor_b32_e32 v8, v8, v26
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v16, v29, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v8, v28, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v9, v29, vcc
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v10, v22
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v14, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, v8, v32
; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc
; SDAG-NEXT: v_xor_b32_e32 v6, v9, v35
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v7, vcc
-; SDAG-NEXT: v_xor_b32_e32 v7, v4, v26
+; SDAG-NEXT: v_xor_b32_e32 v7, v4, v32
; SDAG-NEXT: v_xor_b32_e32 v9, v5, v35
-; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v8, v26
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v8, v32
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v6, v35, vcc
-; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v7, v26, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v7, v32, vcc
; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v35, vcc
; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2512,97 +2488,95 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v32, v1, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v33, v0, 0, s[4:5]
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9]
; SDAG-NEXT: s_cbranch_execz .LBB3_6
; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v18
-; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v18
; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v19, vcc
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc
; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc
; SDAG-NEXT: v_or_b32_e32 v19, v30, v32
-; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0x7f, v18
+; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 0x7f, v18
; SDAG-NEXT: v_or_b32_e32 v20, v31, v33
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[2:3], v21
-; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v21
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v21
+; SDAG-NEXT: v_lshl_b64 v[21:22], v[2:3], v25
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v25
+; SDAG-NEXT: v_lshl_b64 v[23:24], v[0:1], v25
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
-; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
-; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v27, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v26, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v21
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v3, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v2, s[4:5]
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_or_b32_e32 v19, v22, v19
+; SDAG-NEXT: v_or_b32_e32 v18, v21, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v24, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v23, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v25
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB3_5
; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v30
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v30
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v30
; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30
; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v8
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v17, 0
-; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v16
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v18
; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v25, v29
+; SDAG-NEXT: v_or_b32_e32 v18, v25, v29
; SDAG-NEXT: v_or_b32_e32 v24, v24, v28
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v38, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v38, v18, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v24, v37, v24, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
-; SDAG-NEXT: v_cndmask_b32_e32 v27, v16, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v27, v18, v1, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v26, v24, v0, vcc
; SDAG-NEXT: v_mov_b32_e32 v24, 0
; SDAG-NEXT: v_mov_b32_e32 v25, 0
; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v27
; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v19
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
-; SDAG-NEXT: v_or_b32_e32 v23, v25, v23
-; SDAG-NEXT: v_or_b32_e32 v22, v24, v22
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_or_b32_e32 v21, v25, v21
+; SDAG-NEXT: v_or_b32_e32 v20, v24, v20
; SDAG-NEXT: v_or_b32_e32 v24, v28, v38
; SDAG-NEXT: v_or_b32_e32 v25, v26, v39
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v16
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v25
-; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v27, vcc
-; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v24, vcc
-; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v29, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16
-; SDAG-NEXT: v_and_b32_e32 v26, v16, v8
-; SDAG-NEXT: v_and_b32_e32 v28, v16, v9
-; SDAG-NEXT: v_and_b32_e32 v38, v16, v10
-; SDAG-NEXT: v_and_b32_e32 v39, v16, v11
-; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v18
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v34, v25
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v35, v27, vcc
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v36, v24, vcc
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v37, v29, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v18
+; SDAG-NEXT: v_and_b32_e32 v26, v18, v8
+; SDAG-NEXT: v_and_b32_e32 v28, v18, v9
+; SDAG-NEXT: v_and_b32_e32 v38, v18, v10
+; SDAG-NEXT: v_and_b32_e32 v39, v18, v11
+; SDAG-NEXT: v_and_b32_e32 v18, 1, v18
; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v25, v26
; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v28, vcc
; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v24, v38, vcc
@@ -2614,112 +2588,108 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v24, v30, v32
; SDAG-NEXT: v_or_b32_e32 v25, v31, v33
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25]
-; SDAG-NEXT: v_or_b32_e32 v19, v21, v19
+; SDAG-NEXT: v_or_b32_e32 v17, v23, v17
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v18, v20, v18
-; SDAG-NEXT: v_mov_b32_e32 v25, v17
-; SDAG-NEXT: v_mov_b32_e32 v24, v16
+; SDAG-NEXT: v_or_b32_e32 v16, v22, v16
+; SDAG-NEXT: v_mov_b32_e32 v25, v19
+; SDAG-NEXT: v_mov_b32_e32 v24, v18
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB3_3
; SDAG-NEXT: ; %bb.4: ; %Flow13
; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
; SDAG-NEXT: .LBB3_5: ; %Flow14
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT: v_or_b32_e32 v18, v18, v24
-; SDAG-NEXT: v_or_b32_e32 v33, v21, v19
-; SDAG-NEXT: v_or_b32_e32 v30, v17, v23
-; SDAG-NEXT: v_or_b32_e32 v31, v20, v18
-; SDAG-NEXT: v_or_b32_e32 v32, v16, v22
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v22
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_or_b32_e32 v32, v19, v21
+; SDAG-NEXT: v_or_b32_e32 v33, v18, v20
; SDAG-NEXT: .LBB3_6: ; %Flow16
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_or_b32_e32 v17, v13, v15
-; SDAG-NEXT: v_or_b32_e32 v16, v12, v14
-; SDAG-NEXT: v_or_b32_e32 v19, v5, v7
-; SDAG-NEXT: v_or_b32_e32 v18, v4, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v14
-; SDAG-NEXT: v_ffbh_u32_e32 v21, v15
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v12
-; SDAG-NEXT: v_ffbh_u32_e32 v23, v13
-; SDAG-NEXT: v_ffbh_u32_e32 v24, v6
-; SDAG-NEXT: v_ffbh_u32_e32 v25, v7
-; SDAG-NEXT: v_ffbh_u32_e32 v26, v4
-; SDAG-NEXT: v_ffbh_u32_e32 v27, v5
-; SDAG-NEXT: v_mov_b32_e32 v28, 0
+; SDAG-NEXT: v_or_b32_e32 v19, v13, v15
+; SDAG-NEXT: v_or_b32_e32 v18, v12, v14
+; SDAG-NEXT: v_or_b32_e32 v21, v5, v7
+; SDAG-NEXT: v_or_b32_e32 v20, v4, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v14
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v15
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v12
+; SDAG-NEXT: v_ffbh_u32_e32 v25, v13
+; SDAG-NEXT: v_ffbh_u32_e32 v26, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v27, v7
+; SDAG-NEXT: v_ffbh_u32_e32 v28, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v29, v5
+; SDAG-NEXT: v_mov_b32_e32 v30, 0
; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
-; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
-; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
-; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
-; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21]
+; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v22
+; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v24
+; SDAG-NEXT: v_add_i32_e64 v20, s[6:7], 32, v26
+; SDAG-NEXT: v_add_i32_e64 v21, s[6:7], 32, v28
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
-; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
-; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
-; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
-; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
-; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
-; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v23
+; SDAG-NEXT: v_min_u32_e32 v19, v19, v25
+; SDAG-NEXT: v_min_u32_e32 v20, v20, v27
+; SDAG-NEXT: v_min_u32_e32 v21, v21, v29
; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
-; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_addc_u32_e64 v22, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_add_i32_e32 v21, vcc, 64, v21
+; SDAG-NEXT: v_addc_u32_e64 v23, s[4:5], 0, 0, vcc
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
-; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18
-; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc
-; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc
-; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19]
-; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
-; SDAG-NEXT: v_or_b32_e32 v17, v19, v21
-; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT: v_and_b32_e32 v16, 1, v22
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v18, v20
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v22, v19, vcc
+; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v20
+; SDAG-NEXT: v_subb_u32_e32 v22, vcc, 0, v30, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[20:21]
+; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v30, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v22
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v19, v21, v23
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; SDAG-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_and_b32_e32 v18, 1, v24
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v7, 0, s[4:5]
; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v17, v5, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v4, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v6, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v24, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v25, v4, 0, s[4:5]
; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB3_12
; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v18
-; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v18
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: v_mov_b32_e32 v17, 0
-; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
-; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22
-; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v20, vcc
-; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v21, vcc
-; SDAG-NEXT: v_or_b32_e32 v19, v34, v36
-; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v18
-; SDAG-NEXT: v_or_b32_e32 v20, v35, v37
-; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v28
-; SDAG-NEXT: v_sub_i32_e32 v18, vcc, 64, v28
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v28
-; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
-; SDAG-NEXT: v_lshr_b64 v[18:19], v[4:5], v18
-; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
-; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
-; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v19, v23, v19, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v18, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v27, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v26, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v28
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v20
+; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v20
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v21, vcc
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[4:5], v18
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v22, vcc
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v23, vcc
+; SDAG-NEXT: v_or_b32_e32 v21, v34, v36
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 0x7f, v20
+; SDAG-NEXT: v_or_b32_e32 v22, v35, v37
+; SDAG-NEXT: v_lshl_b64 v[23:24], v[6:7], v27
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 64, v27
+; SDAG-NEXT: v_lshl_b64 v[25:26], v[4:5], v27
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[21:22]
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v20
+; SDAG-NEXT: v_or_b32_e32 v21, v24, v21
+; SDAG-NEXT: v_or_b32_e32 v20, v23, v20
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v27
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v26, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v25, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v27
; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, v7, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v6, s[4:5]
; SDAG-NEXT: v_mov_b32_e32 v22, 0
@@ -2728,60 +2698,60 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
; SDAG-NEXT: s_cbranch_execz .LBB3_11
; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT: v_lshr_b64 v[24:25], v[4:5], v34
-; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v34
+; SDAG-NEXT: v_lshr_b64 v[26:27], v[4:5], v34
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, 64, v34
; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34
-; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34
+; SDAG-NEXT: v_lshr_b64 v[28:29], v[6:7], v34
; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v12
-; SDAG-NEXT: v_mov_b32_e32 v22, 0
-; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
+; SDAG-NEXT: v_mov_b32_e32 v25, 0
; SDAG-NEXT: s_mov_b64 s[10:11], 0
-; SDAG-NEXT: v_mov_b32_e32 v17, 0
-; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v16
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: v_lshl_b64 v[30:31], v[6:7], v22
; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39
; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc
-; SDAG-NEXT: v_or_b32_e32 v16, v25, v29
-; SDAG-NEXT: v_or_b32_e32 v24, v24, v28
+; SDAG-NEXT: v_or_b32_e32 v22, v27, v31
+; SDAG-NEXT: v_or_b32_e32 v26, v26, v30
; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc
; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT: v_cndmask_b32_e64 v16, v50, v16, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v24, v49, v24, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v50, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v26, v49, v26, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v31, 0, v29, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v30, 0, v28, s[4:5]
; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc
; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
-; SDAG-NEXT: v_cndmask_b32_e32 v27, v16, v5, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v26, v24, v4, vcc
-; SDAG-NEXT: v_mov_b32_e32 v24, 0
-; SDAG-NEXT: v_mov_b32_e32 v25, 0
+; SDAG-NEXT: v_cndmask_b32_e32 v29, v22, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v28, v26, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v26, 0
+; SDAG-NEXT: v_mov_b32_e32 v27, 0
; SDAG-NEXT: .LBB3_9: ; %udiv-do-while
; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[30:31], v[30:31], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v29
; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
-; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v27
-; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19
; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v21
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT: v_or_b32_e32 v16, v28, v16
-; SDAG-NEXT: v_or_b32_e32 v26, v26, v50
+; SDAG-NEXT: v_or_b32_e32 v22, v30, v22
+; SDAG-NEXT: v_or_b32_e32 v28, v28, v50
; SDAG-NEXT: v_or_b32_e32 v18, v18, v51
-; SDAG-NEXT: v_or_b32_e32 v19, v23, v19
-; SDAG-NEXT: v_or_b32_e32 v21, v25, v21
-; SDAG-NEXT: v_sub_i32_e32 v25, vcc, v38, v26
-; SDAG-NEXT: v_or_b32_e32 v18, v22, v18
-; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v39, v27, vcc
-; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v48, v16, vcc
-; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v49, v29, vcc
-; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v25
-; SDAG-NEXT: v_and_b32_e32 v28, v25, v12
-; SDAG-NEXT: v_and_b32_e32 v50, v25, v13
-; SDAG-NEXT: v_and_b32_e32 v51, v25, v14
-; SDAG-NEXT: v_and_b32_e32 v52, v25, v15
-; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v28
-; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v50, vcc
-; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v16, v51, vcc
-; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v52, vcc
+; SDAG-NEXT: v_or_b32_e32 v19, v25, v19
+; SDAG-NEXT: v_or_b32_e32 v21, v27, v21
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, v38, v28
+; SDAG-NEXT: v_or_b32_e32 v18, v24, v18
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v39, v29, vcc
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v48, v22, vcc
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v49, v31, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v27, 31, v27
+; SDAG-NEXT: v_and_b32_e32 v30, v27, v12
+; SDAG-NEXT: v_and_b32_e32 v50, v27, v13
+; SDAG-NEXT: v_and_b32_e32 v51, v27, v14
+; SDAG-NEXT: v_and_b32_e32 v52, v27, v15
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v28, v30
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v50, vcc
+; SDAG-NEXT: v_subb_u32_e32 v30, vcc, v22, v51, vcc
+; SDAG-NEXT: v_subb_u32_e32 v31, vcc, v31, v52, vcc
; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34
; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc
; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc
@@ -2789,11 +2759,11 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_or_b32_e32 v50, v34, v36
; SDAG-NEXT: v_or_b32_e32 v51, v35, v37
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51]
-; SDAG-NEXT: v_and_b32_e32 v16, 1, v25
+; SDAG-NEXT: v_and_b32_e32 v22, 1, v27
; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT: v_or_b32_e32 v20, v24, v20
-; SDAG-NEXT: v_mov_b32_e32 v25, v17
-; SDAG-NEXT: v_mov_b32_e32 v24, v16
+; SDAG-NEXT: v_or_b32_e32 v20, v26, v20
+; SDAG-NEXT: v_mov_b32_e32 v27, v23
+; SDAG-NEXT: v_mov_b32_e32 v26, v22
; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
; SDAG-NEXT: s_cbranch_execnz .LBB3_9
; SDAG-NEXT: ; %bb.10: ; %Flow
@@ -2804,56 +2774,54 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v21
; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
; SDAG-NEXT: v_or_b32_e32 v18, v18, v24
-; SDAG-NEXT: v_or_b32_e32 v23, v23, v19
-; SDAG-NEXT: v_or_b32_e32 v17, v17, v21
-; SDAG-NEXT: v_or_b32_e32 v22, v22, v18
-; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT: v_or_b32_e32 v24, v23, v21
+; SDAG-NEXT: v_or_b32_e32 v25, v22, v20
; SDAG-NEXT: .LBB3_12: ; %Flow12
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT: v_mul_lo_u32 v21, v32, v11
-; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v32, v10, 0
-; SDAG-NEXT: v_mul_lo_u32 v26, v30, v10
-; SDAG-NEXT: v_mul_lo_u32 v27, v33, v8
-; SDAG-NEXT: v_mul_lo_u32 v28, v31, v9
-; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v32, 0
-; SDAG-NEXT: v_mov_b32_e32 v20, 0
-; SDAG-NEXT: v_mul_lo_u32 v29, v16, v15
-; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0
-; SDAG-NEXT: v_mul_lo_u32 v33, v17, v14
-; SDAG-NEXT: v_mul_lo_u32 v34, v23, v12
-; SDAG-NEXT: v_mul_lo_u32 v35, v22, v13
-; SDAG-NEXT: v_add_i32_e32 v21, vcc, v25, v21
-; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v32, v[19:20]
-; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v18
-; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v29
-; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v21, v26
-; SDAG-NEXT: v_mov_b32_e32 v19, v14
-; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v8, v30, v[19:20]
-; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v33
-; SDAG-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v31, v8, v[24:25]
-; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v19
+; SDAG-NEXT: v_mul_lo_u32 v23, v33, v11
+; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v33, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v28, v32, v10
+; SDAG-NEXT: v_mul_lo_u32 v29, v17, v8
+; SDAG-NEXT: v_mul_lo_u32 v30, v16, v9
+; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v8, v33, 0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mul_lo_u32 v17, v25, v15
+; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v25, v14, 0
+; SDAG-NEXT: v_mul_lo_u32 v31, v24, v14
+; SDAG-NEXT: v_mul_lo_u32 v34, v19, v12
+; SDAG-NEXT: v_mul_lo_u32 v35, v18, v13
+; SDAG-NEXT: v_add_i32_e32 v19, vcc, v27, v23
+; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v9, v33, v[21:22]
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v20
+; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v17
+; SDAG-NEXT: v_add_i32_e64 v27, s[4:5], v19, v28
+; SDAG-NEXT: v_mov_b32_e32 v21, v14
+; SDAG-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v8, v32, v[21:22]
+; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v11, v31
+; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v16, v8, v[26:27]
+; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v15, v20
; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v18, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[10:11]
-; SDAG-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v16, 0
-; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v27, v24
-; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[14:15]
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v19, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v18, v12, v[10:11]
+; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v25, 0
+; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v29, v17
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v32, v[14:15]
; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v34, v11
-; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v16, v[19:20]
-; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v28, v21
-; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v35, v11
-; SDAG-NEXT: v_mov_b32_e32 v19, v14
-; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v17, v[19:20]
-; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v23
-; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v16, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v13, v25, v[21:22]
+; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v30, v17
+; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v35, v11
+; SDAG-NEXT: v_mov_b32_e32 v21, v14
+; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v12, v24, v[21:22]
+; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v16
+; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v17, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc
; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v15, v12
; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v17, v[8:9]
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v24, v[8:9]
; SDAG-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v21, vcc
-; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v18
+; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v9, v18, vcc
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v20
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v11, vcc
; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc
; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index b761f689d6af5..4e1f0c0538bb5 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -70,23 +70,23 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
-; GFX9-NEXT: v_or_b32_e32 v12, v7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT: v_xor_b32_e32 v11, 0x7f, v6
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT: v_or_b32_e32 v11, v11, v8
+; GFX9-NEXT: v_xor_b32_e32 v10, 0x7f, v6
+; GFX9-NEXT: v_or_b32_e32 v11, v7, v9
+; GFX9-NEXT: v_or_b32_e32 v10, v10, v8
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12]
+; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1
-; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v13, v3, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v10, v0, 0, s[4:5]
; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_6
@@ -107,38 +107,36 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_or_b32_e32 v8, v10, v12
; GFX9-NEXT: v_or_b32_e32 v9, v9, v11
; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v13
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], v13, v[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-NEXT: v_mov_b32_e32 v12, 0
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], v13, v[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v13
+; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v9, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz .LBB0_5
; GFX9-NEXT: ; %bb.2: ; %udiv-preheader
; GFX9-NEXT: v_sub_u32_e32 v12, 64, v24
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3]
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
-; GFX9-NEXT: v_or_b32_e32 v12, v8, v12
-; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v24
-; GFX9-NEXT: v_or_b32_e32 v13, v9, v13
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v12, v10, v12
+; GFX9-NEXT: v_subrev_u32_e32 v10, 64, v24
+; GFX9-NEXT: v_or_b32_e32 v13, v11, v13
+; GFX9-NEXT: v_lshrrev_b64 v[10:11], v10, v[2:3]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v15, v9, v1, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc
-; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v15, v11, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v10, v12, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[10:11], v24, v[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v14, v12, v0, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v11, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v10, vcc
; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, -1, v23
; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, -1, v22, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, -1, v4, vcc
@@ -148,32 +146,32 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: .LBB0_3: ; %udiv-do-while
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_lshrrev_b32_e32 v32, 31, v15
; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[14:15]
; GFX9-NEXT: v_lshrrev_b32_e32 v33, 31, v7
; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v9
; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[16:17]
; GFX9-NEXT: v_or_b32_e32 v14, v14, v33
-; GFX9-NEXT: v_or3_b32 v6, v6, v8, v12
-; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v28, v14
+; GFX9-NEXT: v_or3_b32 v6, v6, v10, v12
+; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, v28, v14
; GFX9-NEXT: v_or_b32_e32 v16, v16, v32
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v29, v15, vcc
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v30, v16, vcc
-; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[10:11]
-; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v31, v17, vcc
-; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX9-NEXT: v_or_b32_e32 v10, v18, v10
-; GFX9-NEXT: v_and_b32_e32 v18, v8, v23
-; GFX9-NEXT: v_or_b32_e32 v11, v19, v11
-; GFX9-NEXT: v_and_b32_e32 v19, v8, v22
+; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v29, v15, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v30, v16, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v31, v17, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v10, 31, v10
+; GFX9-NEXT: v_or_b32_e32 v8, v18, v8
+; GFX9-NEXT: v_and_b32_e32 v18, v10, v23
+; GFX9-NEXT: v_or_b32_e32 v9, v19, v9
+; GFX9-NEXT: v_and_b32_e32 v19, v10, v22
; GFX9-NEXT: v_sub_co_u32_e32 v14, vcc, v14, v18
-; GFX9-NEXT: v_and_b32_e32 v32, v8, v4
+; GFX9-NEXT: v_and_b32_e32 v32, v10, v4
; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, v15, v19, vcc
-; GFX9-NEXT: v_and_b32_e32 v33, v8, v5
+; GFX9-NEXT: v_and_b32_e32 v33, v10, v5
; GFX9-NEXT: v_subb_co_u32_e32 v16, vcc, v16, v32, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v17, v33, vcc
; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, -1, v24
@@ -183,42 +181,41 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_or_b32_e32 v18, v24, v26
; GFX9-NEXT: v_or_b32_e32 v19, v25, v27
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
-; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT: v_mov_b32_e32 v19, v9
+; GFX9-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX9-NEXT: v_mov_b32_e32 v19, v11
; GFX9-NEXT: v_or3_b32 v7, v7, 0, v13
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT: v_mov_b32_e32 v18, v8
+; GFX9-NEXT: v_mov_b32_e32 v18, v10
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB0_3
; GFX9-NEXT: ; %bb.4: ; %Flow
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB0_5: ; %Flow2
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[10:11]
-; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 31, v11
-; GFX9-NEXT: v_or3_b32 v10, v7, 0, v13
-; GFX9-NEXT: v_or3_b32 v12, v6, v11, v12
-; GFX9-NEXT: v_or_b32_e32 v11, v9, v15
-; GFX9-NEXT: v_or_b32_e32 v13, v8, v14
+; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[6:7]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[8:9]
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 31, v9
+; GFX9-NEXT: v_or_b32_e32 v12, v12, v14
+; GFX9-NEXT: v_or_b32_e32 v11, v11, v7
+; GFX9-NEXT: v_or_b32_e32 v10, v10, v6
; GFX9-NEXT: .LBB0_6: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT: v_mul_lo_u32 v16, v13, v5
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v13, 0
+; GFX9-NEXT: v_mul_lo_u32 v17, v10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v23, v10, 0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[6:7]
-; GFX9-NEXT: v_mul_lo_u32 v15, v11, v4
-; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22
-; GFX9-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-NEXT: v_mul_lo_u32 v16, v11, v4
+; GFX9-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v22, v10, v[6:7]
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, v14
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v23, v11, v[6:7]
-; GFX9-NEXT: v_add3_u32 v9, v9, v16, v15
+; GFX9-NEXT: v_add3_u32 v9, v9, v17, v16
; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v23, v[8:9]
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v7
+; GFX9-NEXT: v_mul_lo_u32 v4, v12, v22
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v15, v7
+; GFX9-NEXT: v_mul_lo_u32 v14, v13, v23
; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc
-; GFX9-NEXT: v_mul_lo_u32 v15, v10, v23
; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v11, v[12:13]
-; GFX9-NEXT: v_add3_u32 v4, v15, v9, v4
+; GFX9-NEXT: v_add3_u32 v4, v14, v9, v4
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v10, v8
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5
@@ -1475,12 +1472,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v15
; GFX9-NEXT: v_mov_b32_e32 v12, 0
-; GFX9-NEXT: v_mov_b32_e32 v14, 0
; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v11, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_mov_b32_e32 v15, 0
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -1557,13 +1552,12 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB1_5: ; %Flow2
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[10:11]
-; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 31, v11
-; GFX9-NEXT: v_or3_b32 v15, v9, 0, v15
-; GFX9-NEXT: v_or3_b32 v14, v8, v10, v14
-; GFX9-NEXT: v_or_b32_e32 v13, v13, v17
-; GFX9-NEXT: v_or_b32_e32 v12, v12, v16
+; GFX9-NEXT: v_lshlrev_b64 v[14:15], 1, v[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[10:11]
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 31, v11
+; GFX9-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX9-NEXT: v_or_b32_e32 v13, v13, v9
+; GFX9-NEXT: v_or_b32_e32 v12, v12, v8
; GFX9-NEXT: .LBB1_6: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: v_mul_lo_u32 v19, v12, v7
diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll
index bf42e9f1104b4..ae76452b4d2cd 100644
--- a/llvm/test/CodeGen/ARM/cttz.ll
+++ b/llvm/test/CodeGen/ARM/cttz.ll
@@ -263,47 +263,48 @@ define i64 @test_i64(i64 %a) {
;
; CHECK-6M-LABEL: test_i64:
; CHECK-6M: @ %bb.0:
-; CHECK-6M-NEXT: .save {r4, r5, r6, lr}
-; CHECK-6M-NEXT: push {r4, r5, r6, lr}
-; CHECK-6M-NEXT: mov r3, r1
+; CHECK-6M-NEXT: .save {r4, r5, r7, lr}
+; CHECK-6M-NEXT: push {r4, r5, r7, lr}
; CHECK-6M-NEXT: mov r2, r0
-; CHECK-6M-NEXT: movs r1, #0
-; CHECK-6M-NEXT: orrs r0, r3
+; CHECK-6M-NEXT: orrs r0, r1
; CHECK-6M-NEXT: beq .LBB3_6
; CHECK-6M-NEXT: @ %bb.1: @ %cond.false
-; CHECK-6M-NEXT: ldr r6, .LCPI3_0
-; CHECK-6M-NEXT: adr r4, .LCPI3_1
+; CHECK-6M-NEXT: ldr r5, .LCPI3_0
+; CHECK-6M-NEXT: adr r3, .LCPI3_1
; CHECK-6M-NEXT: movs r0, #32
-; CHECK-6M-NEXT: cmp r3, #0
-; CHECK-6M-NEXT: mov r5, r0
+; CHECK-6M-NEXT: cmp r1, #0
+; CHECK-6M-NEXT: mov r4, r0
; CHECK-6M-NEXT: beq .LBB3_3
; CHECK-6M-NEXT: @ %bb.2: @ %cond.false
-; CHECK-6M-NEXT: rsbs r5, r3, #0
-; CHECK-6M-NEXT: ands r5, r3
-; CHECK-6M-NEXT: muls r5, r6, r5
-; CHECK-6M-NEXT: lsrs r3, r5, #27
-; CHECK-6M-NEXT: ldrb r5, [r4, r3]
+; CHECK-6M-NEXT: rsbs r4, r1, #0
+; CHECK-6M-NEXT: ands r4, r1
+; CHECK-6M-NEXT: muls r4, r5, r4
+; CHECK-6M-NEXT: lsrs r1, r4, #27
+; CHECK-6M-NEXT: ldrb r4, [r3, r1]
; CHECK-6M-NEXT: .LBB3_3: @ %cond.false
-; CHECK-6M-NEXT: adds r5, #32
-; CHECK-6M-NEXT: rsbs r3, r2, #0
-; CHECK-6M-NEXT: ands r3, r2
-; CHECK-6M-NEXT: muls r6, r3, r6
-; CHECK-6M-NEXT: lsrs r3, r6, #27
+; CHECK-6M-NEXT: adds r4, #32
+; CHECK-6M-NEXT: rsbs r1, r2, #0
+; CHECK-6M-NEXT: ands r1, r2
+; CHECK-6M-NEXT: muls r5, r1, r5
+; CHECK-6M-NEXT: lsrs r1, r5, #27
; CHECK-6M-NEXT: cmp r2, #0
; CHECK-6M-NEXT: bne .LBB3_7
; CHECK-6M-NEXT: @ %bb.4: @ %cond.false
; CHECK-6M-NEXT: beq .LBB3_8
; CHECK-6M-NEXT: .LBB3_5: @ %cond.end
-; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
+; CHECK-6M-NEXT: movs r1, #0
+; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
; CHECK-6M-NEXT: .LBB3_6:
; CHECK-6M-NEXT: movs r0, #64
-; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
+; CHECK-6M-NEXT: movs r1, #0
+; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
; CHECK-6M-NEXT: .LBB3_7: @ %cond.false
-; CHECK-6M-NEXT: ldrb r0, [r4, r3]
+; CHECK-6M-NEXT: ldrb r0, [r3, r1]
; CHECK-6M-NEXT: bne .LBB3_5
; CHECK-6M-NEXT: .LBB3_8: @ %cond.false
-; CHECK-6M-NEXT: mov r0, r5
-; CHECK-6M-NEXT: pop {r4, r5, r6, pc}
+; CHECK-6M-NEXT: mov r0, r4
+; CHECK-6M-NEXT: movs r1, #0
+; CHECK-6M-NEXT: pop {r4, r5, r7, pc}
; CHECK-6M-NEXT: .p2align 2
; CHECK-6M-NEXT: @ %bb.9:
; CHECK-6M-NEXT: .LCPI3_0:
@@ -313,47 +314,48 @@ define i64 @test_i64(i64 %a) {
;
; CHECK-8MBASE-LABEL: test_i64:
; CHECK-8MBASE: @ %bb.0:
-; CHECK-8MBASE-NEXT: .save {r4, r5, r6, lr}
-; CHECK-8MBASE-NEXT: push {r4, r5, r6, lr}
-; CHECK-8MBASE-NEXT: mov r3, r1
+; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr}
+; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr}
; CHECK-8MBASE-NEXT: mov r2, r0
-; CHECK-8MBASE-NEXT: movs r1, #0
-; CHECK-8MBASE-NEXT: orrs r0, r3
+; CHECK-8MBASE-NEXT: orrs r0, r1
; CHECK-8MBASE-NEXT: beq .LBB3_6
; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false
-; CHECK-8MBASE-NEXT: movw r6, #46385
-; CHECK-8MBASE-NEXT: movt r6, #1916
-; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0
+; CHECK-8MBASE-NEXT: movw r5, #46385
+; CHECK-8MBASE-NEXT: movt r5, #1916
+; CHECK-8MBASE-NEXT: adr r3, .LCPI3_0
; CHECK-8MBASE-NEXT: movs r0, #32
-; CHECK-8MBASE-NEXT: mov r5, r0
-; CHECK-8MBASE-NEXT: cbz r3, .LBB3_3
+; CHECK-8MBASE-NEXT: mov r4, r0
+; CHECK-8MBASE-NEXT: cbz r1, .LBB3_3
; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false
-; CHECK-8MBASE-NEXT: rsbs r5, r3, #0
-; CHECK-8MBASE-NEXT: ands r5, r3
-; CHECK-8MBASE-NEXT: muls r5, r6, r5
-; CHECK-8MBASE-NEXT: lsrs r3, r5, #27
-; CHECK-8MBASE-NEXT: ldrb r5, [r4, r3]
+; CHECK-8MBASE-NEXT: rsbs r4, r1, #0
+; CHECK-8MBASE-NEXT: ands r4, r1
+; CHECK-8MBASE-NEXT: muls r4, r5, r4
+; CHECK-8MBASE-NEXT: lsrs r1, r4, #27
+; CHECK-8MBASE-NEXT: ldrb r4, [r3, r1]
; CHECK-8MBASE-NEXT: .LBB3_3: @ %cond.false
-; CHECK-8MBASE-NEXT: adds r5, #32
-; CHECK-8MBASE-NEXT: rsbs r3, r2, #0
-; CHECK-8MBASE-NEXT: ands r3, r2
-; CHECK-8MBASE-NEXT: muls r6, r3, r6
-; CHECK-8MBASE-NEXT: lsrs r3, r6, #27
+; CHECK-8MBASE-NEXT: adds r4, #32
+; CHECK-8MBASE-NEXT: rsbs r1, r2, #0
+; CHECK-8MBASE-NEXT: ands r1, r2
+; CHECK-8MBASE-NEXT: muls r5, r1, r5
+; CHECK-8MBASE-NEXT: lsrs r1, r5, #27
; CHECK-8MBASE-NEXT: cmp r2, #0
; CHECK-8MBASE-NEXT: bne .LBB3_7
; CHECK-8MBASE-NEXT: @ %bb.4: @ %cond.false
; CHECK-8MBASE-NEXT: beq .LBB3_8
; CHECK-8MBASE-NEXT: .LBB3_5: @ %cond.end
-; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-8MBASE-NEXT: movs r1, #0
+; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
; CHECK-8MBASE-NEXT: .LBB3_6:
; CHECK-8MBASE-NEXT: movs r0, #64
-; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-8MBASE-NEXT: movs r1, #0
+; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
; CHECK-8MBASE-NEXT: .LBB3_7: @ %cond.false
-; CHECK-8MBASE-NEXT: ldrb r0, [r4, r3]
+; CHECK-8MBASE-NEXT: ldrb r0, [r3, r1]
; CHECK-8MBASE-NEXT: bne .LBB3_5
; CHECK-8MBASE-NEXT: .LBB3_8: @ %cond.false
-; CHECK-8MBASE-NEXT: mov r0, r5
-; CHECK-8MBASE-NEXT: pop {r4, r5, r6, pc}
+; CHECK-8MBASE-NEXT: mov r0, r4
+; CHECK-8MBASE-NEXT: movs r1, #0
+; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc}
; CHECK-8MBASE-NEXT: .p2align 2
; CHECK-8MBASE-NEXT: @ %bb.9:
; CHECK-8MBASE-NEXT: .LCPI3_0:
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index cdbbabe3e3b05..5726c2a5bbb16 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -42,90 +42,89 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd23, %r4;
; CHECK-NEXT: add.s64 %rd24, %rd23, 64;
; CHECK-NEXT: selp.b64 %rd25, %rd22, %rd24, %p7;
-; CHECK-NEXT: mov.b64 %rd70, 0;
-; CHECK-NEXT: sub.cc.s64 %rd26, %rd21, %rd25;
-; CHECK-NEXT: subc.cc.s64 %rd27, %rd70, 0;
-; CHECK-NEXT: setp.gt.u64 %p8, %rd26, 127;
-; CHECK-NEXT: setp.eq.b64 %p9, %rd27, 0;
+; CHECK-NEXT: mov.b64 %rd26, 0;
+; CHECK-NEXT: sub.cc.s64 %rd27, %rd21, %rd25;
+; CHECK-NEXT: subc.cc.s64 %rd28, %rd26, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd27, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd28, 0;
; CHECK-NEXT: and.pred %p10, %p9, %p8;
-; CHECK-NEXT: setp.ne.b64 %p11, %rd27, 0;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd28, 0;
; CHECK-NEXT: or.pred %p12, %p10, %p11;
; CHECK-NEXT: or.pred %p13, %p5, %p12;
-; CHECK-NEXT: xor.b64 %rd28, %rd26, 127;
-; CHECK-NEXT: or.b64 %rd29, %rd28, %rd27;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd29, 0;
+; CHECK-NEXT: xor.b64 %rd29, %rd27, 127;
+; CHECK-NEXT: or.b64 %rd30, %rd29, %rd28;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd30, 0;
; CHECK-NEXT: selp.b64 %rd78, 0, %rd3, %p13;
; CHECK-NEXT: selp.b64 %rd77, 0, %rd2, %p13;
; CHECK-NEXT: or.pred %p15, %p13, %p14;
; CHECK-NEXT: @%p15 bra $L__BB0_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd71, %rd26, 1;
-; CHECK-NEXT: addc.cc.s64 %rd72, %rd27, 0;
-; CHECK-NEXT: or.b64 %rd30, %rd71, %rd72;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd30, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd26;
+; CHECK-NEXT: add.cc.s64 %rd71, %rd27, 1;
+; CHECK-NEXT: addc.cc.s64 %rd72, %rd28, 0;
+; CHECK-NEXT: or.b64 %rd31, %rd71, %rd72;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd31, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd27;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd31, %rd3, %r6;
+; CHECK-NEXT: shl.b64 %rd32, %rd3, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd32, %rd2, %r7;
-; CHECK-NEXT: or.b64 %rd33, %rd31, %rd32;
+; CHECK-NEXT: shr.u64 %rd33, %rd2, %r7;
+; CHECK-NEXT: or.b64 %rd34, %rd32, %rd33;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd34, %rd2, %r8;
+; CHECK-NEXT: shl.b64 %rd35, %rd2, %r8;
; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd76, %rd34, %rd33, %p17;
+; CHECK-NEXT: selp.b64 %rd76, %rd35, %rd34, %p17;
; CHECK-NEXT: shl.b64 %rd75, %rd2, %r6;
+; CHECK-NEXT: mov.b64 %rd70, 0;
; CHECK-NEXT: mov.b64 %rd69, %rd70;
; CHECK-NEXT: @%p16 bra $L__BB0_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd71;
-; CHECK-NEXT: shr.u64 %rd35, %rd2, %r9;
+; CHECK-NEXT: shr.u64 %rd36, %rd2, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd36, %rd3, %r10;
-; CHECK-NEXT: or.b64 %rd37, %rd35, %rd36;
+; CHECK-NEXT: shl.b64 %rd37, %rd3, %r10;
+; CHECK-NEXT: or.b64 %rd38, %rd36, %rd37;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd38, %rd3, %r11;
+; CHECK-NEXT: shr.u64 %rd39, %rd3, %r11;
; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd73, %rd38, %rd37, %p18;
+; CHECK-NEXT: selp.b64 %rd73, %rd39, %rd38, %p18;
; CHECK-NEXT: shr.u64 %rd74, %rd3, %r9;
; CHECK-NEXT: add.cc.s64 %rd6, %rd4, -1;
; CHECK-NEXT: addc.cc.s64 %rd7, %rd5, -1;
-; CHECK-NEXT: mov.b64 %rd69, 0;
-; CHECK-NEXT: mov.b64 %rd70, %rd69;
+; CHECK-NEXT: mov.b64 %rd69, %rd70;
; CHECK-NEXT: $L__BB0_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd39, %rd73, 63;
-; CHECK-NEXT: shl.b64 %rd40, %rd74, 1;
-; CHECK-NEXT: or.b64 %rd41, %rd40, %rd39;
-; CHECK-NEXT: shl.b64 %rd42, %rd73, 1;
-; CHECK-NEXT: shr.u64 %rd43, %rd76, 63;
-; CHECK-NEXT: or.b64 %rd44, %rd42, %rd43;
-; CHECK-NEXT: shr.u64 %rd45, %rd75, 63;
-; CHECK-NEXT: shl.b64 %rd46, %rd76, 1;
-; CHECK-NEXT: or.b64 %rd47, %rd46, %rd45;
-; CHECK-NEXT: shl.b64 %rd48, %rd75, 1;
-; CHECK-NEXT: or.b64 %rd75, %rd70, %rd48;
-; CHECK-NEXT: or.b64 %rd76, %rd69, %rd47;
-; CHECK-NEXT: sub.cc.s64 %rd49, %rd6, %rd44;
-; CHECK-NEXT: subc.cc.s64 %rd50, %rd7, %rd41;
-; CHECK-NEXT: shr.s64 %rd51, %rd50, 63;
-; CHECK-NEXT: and.b64 %rd70, %rd51, 1;
-; CHECK-NEXT: and.b64 %rd52, %rd51, %rd4;
-; CHECK-NEXT: and.b64 %rd53, %rd51, %rd5;
-; CHECK-NEXT: sub.cc.s64 %rd73, %rd44, %rd52;
-; CHECK-NEXT: subc.cc.s64 %rd74, %rd41, %rd53;
+; CHECK-NEXT: shr.u64 %rd40, %rd73, 63;
+; CHECK-NEXT: shl.b64 %rd41, %rd74, 1;
+; CHECK-NEXT: or.b64 %rd42, %rd41, %rd40;
+; CHECK-NEXT: shl.b64 %rd43, %rd73, 1;
+; CHECK-NEXT: shr.u64 %rd44, %rd76, 63;
+; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44;
+; CHECK-NEXT: shr.u64 %rd46, %rd75, 63;
+; CHECK-NEXT: shl.b64 %rd47, %rd76, 1;
+; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46;
+; CHECK-NEXT: shl.b64 %rd49, %rd75, 1;
+; CHECK-NEXT: or.b64 %rd75, %rd69, %rd49;
+; CHECK-NEXT: or.b64 %rd76, %rd70, %rd48;
+; CHECK-NEXT: sub.cc.s64 %rd50, %rd6, %rd45;
+; CHECK-NEXT: subc.cc.s64 %rd51, %rd7, %rd42;
+; CHECK-NEXT: shr.s64 %rd52, %rd51, 63;
+; CHECK-NEXT: and.b64 %rd69, %rd52, 1;
+; CHECK-NEXT: and.b64 %rd53, %rd52, %rd4;
+; CHECK-NEXT: and.b64 %rd54, %rd52, %rd5;
+; CHECK-NEXT: sub.cc.s64 %rd73, %rd45, %rd53;
+; CHECK-NEXT: subc.cc.s64 %rd74, %rd42, %rd54;
; CHECK-NEXT: add.cc.s64 %rd71, %rd71, -1;
; CHECK-NEXT: addc.cc.s64 %rd72, %rd72, -1;
-; CHECK-NEXT: or.b64 %rd54, %rd71, %rd72;
-; CHECK-NEXT: setp.eq.b64 %p19, %rd54, 0;
+; CHECK-NEXT: or.b64 %rd55, %rd71, %rd72;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd55, 0;
; CHECK-NEXT: @%p19 bra $L__BB0_4;
; CHECK-NEXT: bra.uni $L__BB0_2;
; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd55, %rd75, 63;
-; CHECK-NEXT: shl.b64 %rd56, %rd76, 1;
-; CHECK-NEXT: or.b64 %rd57, %rd56, %rd55;
+; CHECK-NEXT: shr.u64 %rd56, %rd75, 63;
+; CHECK-NEXT: shl.b64 %rd57, %rd76, 1;
+; CHECK-NEXT: or.b64 %rd78, %rd57, %rd56;
; CHECK-NEXT: shl.b64 %rd58, %rd75, 1;
-; CHECK-NEXT: or.b64 %rd77, %rd70, %rd58;
-; CHECK-NEXT: or.b64 %rd78, %rd69, %rd57;
+; CHECK-NEXT: or.b64 %rd77, %rd69, %rd58;
; CHECK-NEXT: $L__BB0_5: // %udiv-end
; CHECK-NEXT: mul.hi.u64 %rd59, %rd4, %rd77;
; CHECK-NEXT: mad.lo.s64 %rd60, %rd4, %rd78, %rd59;
@@ -172,90 +171,89 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd14, %r4;
; CHECK-NEXT: add.s64 %rd15, %rd14, 64;
; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5;
-; CHECK-NEXT: mov.b64 %rd57, 0;
-; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16;
-; CHECK-NEXT: subc.cc.s64 %rd18, %rd57, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd18, 0;
+; CHECK-NEXT: mov.b64 %rd17, 0;
+; CHECK-NEXT: sub.cc.s64 %rd18, %rd12, %rd16;
+; CHECK-NEXT: subc.cc.s64 %rd19, %rd17, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd18, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd19, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd19, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd19, %rd17, 127;
-; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0;
+; CHECK-NEXT: xor.b64 %rd20, %rd18, 127;
+; CHECK-NEXT: or.b64 %rd21, %rd20, %rd19;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd21, 0;
; CHECK-NEXT: selp.b64 %rd65, 0, %rd6, %p11;
; CHECK-NEXT: selp.b64 %rd64, 0, %rd5, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB1_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd58, %rd17, 1;
-; CHECK-NEXT: addc.cc.s64 %rd59, %rd18, 0;
-; CHECK-NEXT: or.b64 %rd21, %rd58, %rd59;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd17;
+; CHECK-NEXT: add.cc.s64 %rd58, %rd18, 1;
+; CHECK-NEXT: addc.cc.s64 %rd59, %rd19, 0;
+; CHECK-NEXT: or.b64 %rd22, %rd58, %rd59;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd22, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd18;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd22, %rd6, %r6;
+; CHECK-NEXT: shl.b64 %rd23, %rd6, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd23, %rd5, %r7;
-; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23;
+; CHECK-NEXT: shr.u64 %rd24, %rd5, %r7;
+; CHECK-NEXT: or.b64 %rd25, %rd23, %rd24;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd25, %rd5, %r8;
+; CHECK-NEXT: shl.b64 %rd26, %rd5, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd63, %rd25, %rd24, %p15;
+; CHECK-NEXT: selp.b64 %rd63, %rd26, %rd25, %p15;
; CHECK-NEXT: shl.b64 %rd62, %rd5, %r6;
+; CHECK-NEXT: mov.b64 %rd57, 0;
; CHECK-NEXT: mov.b64 %rd56, %rd57;
; CHECK-NEXT: @%p14 bra $L__BB1_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd58;
-; CHECK-NEXT: shr.u64 %rd26, %rd5, %r9;
+; CHECK-NEXT: shr.u64 %rd27, %rd5, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd27, %rd6, %r10;
-; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27;
+; CHECK-NEXT: shl.b64 %rd28, %rd6, %r10;
+; CHECK-NEXT: or.b64 %rd29, %rd27, %rd28;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd29, %rd6, %r11;
+; CHECK-NEXT: shr.u64 %rd30, %rd6, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd60, %rd29, %rd28, %p16;
+; CHECK-NEXT: selp.b64 %rd60, %rd30, %rd29, %p16;
; CHECK-NEXT: shr.u64 %rd61, %rd6, %r9;
; CHECK-NEXT: add.cc.s64 %rd3, %rd1, -1;
; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, -1;
-; CHECK-NEXT: mov.b64 %rd56, 0;
-; CHECK-NEXT: mov.b64 %rd57, %rd56;
+; CHECK-NEXT: mov.b64 %rd56, %rd57;
; CHECK-NEXT: $L__BB1_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd30, %rd60, 63;
-; CHECK-NEXT: shl.b64 %rd31, %rd61, 1;
-; CHECK-NEXT: or.b64 %rd32, %rd31, %rd30;
-; CHECK-NEXT: shl.b64 %rd33, %rd60, 1;
-; CHECK-NEXT: shr.u64 %rd34, %rd63, 63;
-; CHECK-NEXT: or.b64 %rd35, %rd33, %rd34;
-; CHECK-NEXT: shr.u64 %rd36, %rd62, 63;
-; CHECK-NEXT: shl.b64 %rd37, %rd63, 1;
-; CHECK-NEXT: or.b64 %rd38, %rd37, %rd36;
-; CHECK-NEXT: shl.b64 %rd39, %rd62, 1;
-; CHECK-NEXT: or.b64 %rd62, %rd57, %rd39;
-; CHECK-NEXT: or.b64 %rd63, %rd56, %rd38;
-; CHECK-NEXT: sub.cc.s64 %rd40, %rd3, %rd35;
-; CHECK-NEXT: subc.cc.s64 %rd41, %rd4, %rd32;
-; CHECK-NEXT: shr.s64 %rd42, %rd41, 63;
-; CHECK-NEXT: and.b64 %rd57, %rd42, 1;
-; CHECK-NEXT: and.b64 %rd43, %rd42, %rd1;
-; CHECK-NEXT: and.b64 %rd44, %rd42, %rd2;
-; CHECK-NEXT: sub.cc.s64 %rd60, %rd35, %rd43;
-; CHECK-NEXT: subc.cc.s64 %rd61, %rd32, %rd44;
+; CHECK-NEXT: shr.u64 %rd31, %rd60, 63;
+; CHECK-NEXT: shl.b64 %rd32, %rd61, 1;
+; CHECK-NEXT: or.b64 %rd33, %rd32, %rd31;
+; CHECK-NEXT: shl.b64 %rd34, %rd60, 1;
+; CHECK-NEXT: shr.u64 %rd35, %rd63, 63;
+; CHECK-NEXT: or.b64 %rd36, %rd34, %rd35;
+; CHECK-NEXT: shr.u64 %rd37, %rd62, 63;
+; CHECK-NEXT: shl.b64 %rd38, %rd63, 1;
+; CHECK-NEXT: or.b64 %rd39, %rd38, %rd37;
+; CHECK-NEXT: shl.b64 %rd40, %rd62, 1;
+; CHECK-NEXT: or.b64 %rd62, %rd56, %rd40;
+; CHECK-NEXT: or.b64 %rd63, %rd57, %rd39;
+; CHECK-NEXT: sub.cc.s64 %rd41, %rd3, %rd36;
+; CHECK-NEXT: subc.cc.s64 %rd42, %rd4, %rd33;
+; CHECK-NEXT: shr.s64 %rd43, %rd42, 63;
+; CHECK-NEXT: and.b64 %rd56, %rd43, 1;
+; CHECK-NEXT: and.b64 %rd44, %rd43, %rd1;
+; CHECK-NEXT: and.b64 %rd45, %rd43, %rd2;
+; CHECK-NEXT: sub.cc.s64 %rd60, %rd36, %rd44;
+; CHECK-NEXT: subc.cc.s64 %rd61, %rd33, %rd45;
; CHECK-NEXT: add.cc.s64 %rd58, %rd58, -1;
; CHECK-NEXT: addc.cc.s64 %rd59, %rd59, -1;
-; CHECK-NEXT: or.b64 %rd45, %rd58, %rd59;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd45, 0;
+; CHECK-NEXT: or.b64 %rd46, %rd58, %rd59;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd46, 0;
; CHECK-NEXT: @%p17 bra $L__BB1_4;
; CHECK-NEXT: bra.uni $L__BB1_2;
; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd46, %rd62, 63;
-; CHECK-NEXT: shl.b64 %rd47, %rd63, 1;
-; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46;
+; CHECK-NEXT: shr.u64 %rd47, %rd62, 63;
+; CHECK-NEXT: shl.b64 %rd48, %rd63, 1;
+; CHECK-NEXT: or.b64 %rd65, %rd48, %rd47;
; CHECK-NEXT: shl.b64 %rd49, %rd62, 1;
-; CHECK-NEXT: or.b64 %rd64, %rd57, %rd49;
-; CHECK-NEXT: or.b64 %rd65, %rd56, %rd48;
+; CHECK-NEXT: or.b64 %rd64, %rd56, %rd49;
; CHECK-NEXT: $L__BB1_5: // %udiv-end
; CHECK-NEXT: mul.hi.u64 %rd50, %rd1, %rd64;
; CHECK-NEXT: mad.lo.s64 %rd51, %rd1, %rd65, %rd50;
@@ -344,90 +342,89 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd24, %r4;
; CHECK-NEXT: add.s64 %rd25, %rd24, 64;
; CHECK-NEXT: selp.b64 %rd26, %rd23, %rd25, %p7;
-; CHECK-NEXT: mov.b64 %rd65, 0;
-; CHECK-NEXT: sub.cc.s64 %rd27, %rd22, %rd26;
-; CHECK-NEXT: subc.cc.s64 %rd28, %rd65, 0;
-; CHECK-NEXT: setp.gt.u64 %p8, %rd27, 127;
-; CHECK-NEXT: setp.eq.b64 %p9, %rd28, 0;
+; CHECK-NEXT: mov.b64 %rd27, 0;
+; CHECK-NEXT: sub.cc.s64 %rd28, %rd22, %rd26;
+; CHECK-NEXT: subc.cc.s64 %rd29, %rd27, 0;
+; CHECK-NEXT: setp.gt.u64 %p8, %rd28, 127;
+; CHECK-NEXT: setp.eq.b64 %p9, %rd29, 0;
; CHECK-NEXT: and.pred %p10, %p9, %p8;
-; CHECK-NEXT: setp.ne.b64 %p11, %rd28, 0;
+; CHECK-NEXT: setp.ne.b64 %p11, %rd29, 0;
; CHECK-NEXT: or.pred %p12, %p10, %p11;
; CHECK-NEXT: or.pred %p13, %p5, %p12;
-; CHECK-NEXT: xor.b64 %rd29, %rd27, 127;
-; CHECK-NEXT: or.b64 %rd30, %rd29, %rd28;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd30, 0;
+; CHECK-NEXT: xor.b64 %rd30, %rd28, 127;
+; CHECK-NEXT: or.b64 %rd31, %rd30, %rd29;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd31, 0;
; CHECK-NEXT: selp.b64 %rd73, 0, %rd2, %p13;
; CHECK-NEXT: selp.b64 %rd72, 0, %rd1, %p13;
; CHECK-NEXT: or.pred %p15, %p13, %p14;
; CHECK-NEXT: @%p15 bra $L__BB4_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd66, %rd27, 1;
-; CHECK-NEXT: addc.cc.s64 %rd67, %rd28, 0;
-; CHECK-NEXT: or.b64 %rd31, %rd66, %rd67;
-; CHECK-NEXT: setp.eq.b64 %p16, %rd31, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd27;
+; CHECK-NEXT: add.cc.s64 %rd66, %rd28, 1;
+; CHECK-NEXT: addc.cc.s64 %rd67, %rd29, 0;
+; CHECK-NEXT: or.b64 %rd32, %rd66, %rd67;
+; CHECK-NEXT: setp.eq.b64 %p16, %rd32, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd28;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd32, %rd2, %r6;
+; CHECK-NEXT: shl.b64 %rd33, %rd2, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd33, %rd1, %r7;
-; CHECK-NEXT: or.b64 %rd34, %rd32, %rd33;
+; CHECK-NEXT: shr.u64 %rd34, %rd1, %r7;
+; CHECK-NEXT: or.b64 %rd35, %rd33, %rd34;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd35, %rd1, %r8;
+; CHECK-NEXT: shl.b64 %rd36, %rd1, %r8;
; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd71, %rd35, %rd34, %p17;
+; CHECK-NEXT: selp.b64 %rd71, %rd36, %rd35, %p17;
; CHECK-NEXT: shl.b64 %rd70, %rd1, %r6;
+; CHECK-NEXT: mov.b64 %rd65, 0;
; CHECK-NEXT: mov.b64 %rd64, %rd65;
; CHECK-NEXT: @%p16 bra $L__BB4_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd66;
-; CHECK-NEXT: shr.u64 %rd36, %rd1, %r9;
+; CHECK-NEXT: shr.u64 %rd37, %rd1, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd37, %rd2, %r10;
-; CHECK-NEXT: or.b64 %rd38, %rd36, %rd37;
+; CHECK-NEXT: shl.b64 %rd38, %rd2, %r10;
+; CHECK-NEXT: or.b64 %rd39, %rd37, %rd38;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd39, %rd2, %r11;
+; CHECK-NEXT: shr.u64 %rd40, %rd2, %r11;
; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd68, %rd39, %rd38, %p18;
+; CHECK-NEXT: selp.b64 %rd68, %rd40, %rd39, %p18;
; CHECK-NEXT: shr.u64 %rd69, %rd2, %r9;
; CHECK-NEXT: add.cc.s64 %rd6, %rd3, -1;
; CHECK-NEXT: addc.cc.s64 %rd7, %rd4, -1;
-; CHECK-NEXT: mov.b64 %rd64, 0;
-; CHECK-NEXT: mov.b64 %rd65, %rd64;
+; CHECK-NEXT: mov.b64 %rd64, %rd65;
; CHECK-NEXT: $L__BB4_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd40, %rd68, 63;
-; CHECK-NEXT: shl.b64 %rd41, %rd69, 1;
-; CHECK-NEXT: or.b64 %rd42, %rd41, %rd40;
-; CHECK-NEXT: shl.b64 %rd43, %rd68, 1;
-; CHECK-NEXT: shr.u64 %rd44, %rd71, 63;
-; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44;
-; CHECK-NEXT: shr.u64 %rd46, %rd70, 63;
-; CHECK-NEXT: shl.b64 %rd47, %rd71, 1;
-; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46;
-; CHECK-NEXT: shl.b64 %rd49, %rd70, 1;
-; CHECK-NEXT: or.b64 %rd70, %rd65, %rd49;
-; CHECK-NEXT: or.b64 %rd71, %rd64, %rd48;
-; CHECK-NEXT: sub.cc.s64 %rd50, %rd6, %rd45;
-; CHECK-NEXT: subc.cc.s64 %rd51, %rd7, %rd42;
-; CHECK-NEXT: shr.s64 %rd52, %rd51, 63;
-; CHECK-NEXT: and.b64 %rd65, %rd52, 1;
-; CHECK-NEXT: and.b64 %rd53, %rd52, %rd3;
-; CHECK-NEXT: and.b64 %rd54, %rd52, %rd4;
-; CHECK-NEXT: sub.cc.s64 %rd68, %rd45, %rd53;
-; CHECK-NEXT: subc.cc.s64 %rd69, %rd42, %rd54;
+; CHECK-NEXT: shr.u64 %rd41, %rd68, 63;
+; CHECK-NEXT: shl.b64 %rd42, %rd69, 1;
+; CHECK-NEXT: or.b64 %rd43, %rd42, %rd41;
+; CHECK-NEXT: shl.b64 %rd44, %rd68, 1;
+; CHECK-NEXT: shr.u64 %rd45, %rd71, 63;
+; CHECK-NEXT: or.b64 %rd46, %rd44, %rd45;
+; CHECK-NEXT: shr.u64 %rd47, %rd70, 63;
+; CHECK-NEXT: shl.b64 %rd48, %rd71, 1;
+; CHECK-NEXT: or.b64 %rd49, %rd48, %rd47;
+; CHECK-NEXT: shl.b64 %rd50, %rd70, 1;
+; CHECK-NEXT: or.b64 %rd70, %rd64, %rd50;
+; CHECK-NEXT: or.b64 %rd71, %rd65, %rd49;
+; CHECK-NEXT: sub.cc.s64 %rd51, %rd6, %rd46;
+; CHECK-NEXT: subc.cc.s64 %rd52, %rd7, %rd43;
+; CHECK-NEXT: shr.s64 %rd53, %rd52, 63;
+; CHECK-NEXT: and.b64 %rd64, %rd53, 1;
+; CHECK-NEXT: and.b64 %rd54, %rd53, %rd3;
+; CHECK-NEXT: and.b64 %rd55, %rd53, %rd4;
+; CHECK-NEXT: sub.cc.s64 %rd68, %rd46, %rd54;
+; CHECK-NEXT: subc.cc.s64 %rd69, %rd43, %rd55;
; CHECK-NEXT: add.cc.s64 %rd66, %rd66, -1;
; CHECK-NEXT: addc.cc.s64 %rd67, %rd67, -1;
-; CHECK-NEXT: or.b64 %rd55, %rd66, %rd67;
-; CHECK-NEXT: setp.eq.b64 %p19, %rd55, 0;
+; CHECK-NEXT: or.b64 %rd56, %rd66, %rd67;
+; CHECK-NEXT: setp.eq.b64 %p19, %rd56, 0;
; CHECK-NEXT: @%p19 bra $L__BB4_4;
; CHECK-NEXT: bra.uni $L__BB4_2;
; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd56, %rd70, 63;
-; CHECK-NEXT: shl.b64 %rd57, %rd71, 1;
-; CHECK-NEXT: or.b64 %rd58, %rd57, %rd56;
+; CHECK-NEXT: shr.u64 %rd57, %rd70, 63;
+; CHECK-NEXT: shl.b64 %rd58, %rd71, 1;
+; CHECK-NEXT: or.b64 %rd73, %rd58, %rd57;
; CHECK-NEXT: shl.b64 %rd59, %rd70, 1;
-; CHECK-NEXT: or.b64 %rd72, %rd65, %rd59;
-; CHECK-NEXT: or.b64 %rd73, %rd64, %rd58;
+; CHECK-NEXT: or.b64 %rd72, %rd64, %rd59;
; CHECK-NEXT: $L__BB4_5: // %udiv-end
; CHECK-NEXT: xor.b64 %rd60, %rd72, %rd5;
; CHECK-NEXT: xor.b64 %rd61, %rd73, %rd5;
@@ -468,90 +465,89 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
; CHECK-NEXT: cvt.u64.u32 %rd14, %r4;
; CHECK-NEXT: add.s64 %rd15, %rd14, 64;
; CHECK-NEXT: selp.b64 %rd16, %rd13, %rd15, %p5;
-; CHECK-NEXT: mov.b64 %rd51, 0;
-; CHECK-NEXT: sub.cc.s64 %rd17, %rd12, %rd16;
-; CHECK-NEXT: subc.cc.s64 %rd18, %rd51, 0;
-; CHECK-NEXT: setp.gt.u64 %p6, %rd17, 127;
-; CHECK-NEXT: setp.eq.b64 %p7, %rd18, 0;
+; CHECK-NEXT: mov.b64 %rd17, 0;
+; CHECK-NEXT: sub.cc.s64 %rd18, %rd12, %rd16;
+; CHECK-NEXT: subc.cc.s64 %rd19, %rd17, 0;
+; CHECK-NEXT: setp.gt.u64 %p6, %rd18, 127;
+; CHECK-NEXT: setp.eq.b64 %p7, %rd19, 0;
; CHECK-NEXT: and.pred %p8, %p7, %p6;
-; CHECK-NEXT: setp.ne.b64 %p9, %rd18, 0;
+; CHECK-NEXT: setp.ne.b64 %p9, %rd19, 0;
; CHECK-NEXT: or.pred %p10, %p8, %p9;
; CHECK-NEXT: or.pred %p11, %p3, %p10;
-; CHECK-NEXT: xor.b64 %rd19, %rd17, 127;
-; CHECK-NEXT: or.b64 %rd20, %rd19, %rd18;
-; CHECK-NEXT: setp.eq.b64 %p12, %rd20, 0;
+; CHECK-NEXT: xor.b64 %rd20, %rd18, 127;
+; CHECK-NEXT: or.b64 %rd21, %rd20, %rd19;
+; CHECK-NEXT: setp.eq.b64 %p12, %rd21, 0;
; CHECK-NEXT: selp.b64 %rd59, 0, %rd4, %p11;
; CHECK-NEXT: selp.b64 %rd58, 0, %rd3, %p11;
; CHECK-NEXT: or.pred %p13, %p11, %p12;
; CHECK-NEXT: @%p13 bra $L__BB5_5;
; CHECK-NEXT: // %bb.3: // %udiv-bb1
-; CHECK-NEXT: add.cc.s64 %rd52, %rd17, 1;
-; CHECK-NEXT: addc.cc.s64 %rd53, %rd18, 0;
-; CHECK-NEXT: or.b64 %rd21, %rd52, %rd53;
-; CHECK-NEXT: setp.eq.b64 %p14, %rd21, 0;
-; CHECK-NEXT: cvt.u32.u64 %r5, %rd17;
+; CHECK-NEXT: add.cc.s64 %rd52, %rd18, 1;
+; CHECK-NEXT: addc.cc.s64 %rd53, %rd19, 0;
+; CHECK-NEXT: or.b64 %rd22, %rd52, %rd53;
+; CHECK-NEXT: setp.eq.b64 %p14, %rd22, 0;
+; CHECK-NEXT: cvt.u32.u64 %r5, %rd18;
; CHECK-NEXT: sub.s32 %r6, 127, %r5;
-; CHECK-NEXT: shl.b64 %rd22, %rd4, %r6;
+; CHECK-NEXT: shl.b64 %rd23, %rd4, %r6;
; CHECK-NEXT: sub.s32 %r7, 64, %r6;
-; CHECK-NEXT: shr.u64 %rd23, %rd3, %r7;
-; CHECK-NEXT: or.b64 %rd24, %rd22, %rd23;
+; CHECK-NEXT: shr.u64 %rd24, %rd3, %r7;
+; CHECK-NEXT: or.b64 %rd25, %rd23, %rd24;
; CHECK-NEXT: sub.s32 %r8, 63, %r5;
-; CHECK-NEXT: shl.b64 %rd25, %rd3, %r8;
+; CHECK-NEXT: shl.b64 %rd26, %rd3, %r8;
; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT: selp.b64 %rd57, %rd25, %rd24, %p15;
+; CHECK-NEXT: selp.b64 %rd57, %rd26, %rd25, %p15;
; CHECK-NEXT: shl.b64 %rd56, %rd3, %r6;
+; CHECK-NEXT: mov.b64 %rd51, 0;
; CHECK-NEXT: mov.b64 %rd50, %rd51;
; CHECK-NEXT: @%p14 bra $L__BB5_4;
; CHECK-NEXT: // %bb.1: // %udiv-preheader
; CHECK-NEXT: cvt.u32.u64 %r9, %rd52;
-; CHECK-NEXT: shr.u64 %rd26, %rd3, %r9;
+; CHECK-NEXT: shr.u64 %rd27, %rd3, %r9;
; CHECK-NEXT: sub.s32 %r10, 64, %r9;
-; CHECK-NEXT: shl.b64 %rd27, %rd4, %r10;
-; CHECK-NEXT: or.b64 %rd28, %rd26, %rd27;
+; CHECK-NEXT: shl.b64 %rd28, %rd4, %r10;
+; CHECK-NEXT: or.b64 %rd29, %rd27, %rd28;
; CHECK-NEXT: add.s32 %r11, %r9, -64;
-; CHECK-NEXT: shr.u64 %rd29, %rd4, %r11;
+; CHECK-NEXT: shr.u64 %rd30, %rd4, %r11;
; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT: selp.b64 %rd54, %rd29, %rd28, %p16;
+; CHECK-NEXT: selp.b64 %rd54, %rd30, %rd29, %p16;
; CHECK-NEXT: shr.u64 %rd55, %rd4, %r9;
; CHECK-NEXT: add.cc.s64 %rd1, %rd5, -1;
; CHECK-NEXT: addc.cc.s64 %rd2, %rd6, -1;
-; CHECK-NEXT: mov.b64 %rd50, 0;
-; CHECK-NEXT: mov.b64 %rd51, %rd50;
+; CHECK-NEXT: mov.b64 %rd50, %rd51;
; CHECK-NEXT: $L__BB5_2: // %udiv-do-while
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u64 %rd30, %rd54, 63;
-; CHECK-NEXT: shl.b64 %rd31, %rd55, 1;
-; CHECK-NEXT: or.b64 %rd32, %rd31, %rd30;
-; CHECK-NEXT: shl.b64 %rd33, %rd54, 1;
-; CHECK-NEXT: shr.u64 %rd34, %rd57, 63;
-; CHECK-NEXT: or.b64 %rd35, %rd33, %rd34;
-; CHECK-NEXT: shr.u64 %rd36, %rd56, 63;
-; CHECK-NEXT: shl.b64 %rd37, %rd57, 1;
-; CHECK-NEXT: or.b64 %rd38, %rd37, %rd36;
-; CHECK-NEXT: shl.b64 %rd39, %rd56, 1;
-; CHECK-NEXT: or.b64 %rd56, %rd51, %rd39;
-; CHECK-NEXT: or.b64 %rd57, %rd50, %rd38;
-; CHECK-NEXT: sub.cc.s64 %rd40, %rd1, %rd35;
-; CHECK-NEXT: subc.cc.s64 %rd41, %rd2, %rd32;
-; CHECK-NEXT: shr.s64 %rd42, %rd41, 63;
-; CHECK-NEXT: and.b64 %rd51, %rd42, 1;
-; CHECK-NEXT: and.b64 %rd43, %rd42, %rd5;
-; CHECK-NEXT: and.b64 %rd44, %rd42, %rd6;
-; CHECK-NEXT: sub.cc.s64 %rd54, %rd35, %rd43;
-; CHECK-NEXT: subc.cc.s64 %rd55, %rd32, %rd44;
+; CHECK-NEXT: shr.u64 %rd31, %rd54, 63;
+; CHECK-NEXT: shl.b64 %rd32, %rd55, 1;
+; CHECK-NEXT: or.b64 %rd33, %rd32, %rd31;
+; CHECK-NEXT: shl.b64 %rd34, %rd54, 1;
+; CHECK-NEXT: shr.u64 %rd35, %rd57, 63;
+; CHECK-NEXT: or.b64 %rd36, %rd34, %rd35;
+; CHECK-NEXT: shr.u64 %rd37, %rd56, 63;
+; CHECK-NEXT: shl.b64 %rd38, %rd57, 1;
+; CHECK-NEXT: or.b64 %rd39, %rd38, %rd37;
+; CHECK-NEXT: shl.b64 %rd40, %rd56, 1;
+; CHECK-NEXT: or.b64 %rd56, %rd50, %rd40;
+; CHECK-NEXT: or.b64 %rd57, %rd51, %rd39;
+; CHECK-NEXT: sub.cc.s64 %rd41, %rd1, %rd36;
+; CHECK-NEXT: subc.cc.s64 %rd42, %rd2, %rd33;
+; CHECK-NEXT: shr.s64 %rd43, %rd42, 63;
+; CHECK-NEXT: and.b64 %rd50, %rd43, 1;
+; CHECK-NEXT: and.b64 %rd44, %rd43, %rd5;
+; CHECK-NEXT: and.b64 %rd45, %rd43, %rd6;
+; CHECK-NEXT: sub.cc.s64 %rd54, %rd36, %rd44;
+; CHECK-NEXT: subc.cc.s64 %rd55, %rd33, %rd45;
; CHECK-NEXT: add.cc.s64 %rd52, %rd52, -1;
; CHECK-NEXT: addc.cc.s64 %rd53, %rd53, -1;
-; CHECK-NEXT: or.b64 %rd45, %rd52, %rd53;
-; CHECK-NEXT: setp.eq.b64 %p17, %rd45, 0;
+; CHECK-NEXT: or.b64 %rd46, %rd52, %rd53;
+; CHECK-NEXT: setp.eq.b64 %p17, %rd46, 0;
; CHECK-NEXT: @%p17 bra $L__BB5_4;
; CHECK-NEXT: bra.uni $L__BB5_2;
; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT: shr.u64 %rd46, %rd56, 63;
-; CHECK-NEXT: shl.b64 %rd47, %rd57, 1;
-; CHECK-NEXT: or.b64 %rd48, %rd47, %rd46;
+; CHECK-NEXT: shr.u64 %rd47, %rd56, 63;
+; CHECK-NEXT: shl.b64 %rd48, %rd57, 1;
+; CHECK-NEXT: or.b64 %rd59, %rd48, %rd47;
; CHECK-NEXT: shl.b64 %rd49, %rd56, 1;
-; CHECK-NEXT: or.b64 %rd58, %rd51, %rd49;
-; CHECK-NEXT: or.b64 %rd59, %rd50, %rd48;
+; CHECK-NEXT: or.b64 %rd58, %rd50, %rd49;
; CHECK-NEXT: $L__BB5_5: // %udiv-end
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd58, %rd59};
; CHECK-NEXT: ret;
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 976c57e422761..7f089782d87e0 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -402,14 +402,13 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_3:
; RV32I-NEXT: li a0, 64
-; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: srli s1, s1, 27
; RV32I-NEXT: add s1, s3, s1
; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: .LBB3_5: # %cond.false
+; RV32I-NEXT: .LBB3_5: # %cond.end
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -463,8 +462,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
; RV32M-NEXT: .LBB3_3:
-; RV32M-NEXT: li a1, 0
; RV32M-NEXT: li a0, 64
+; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
; RV32M-NEXT: .LBB3_4:
; RV32M-NEXT: neg a1, a0
@@ -527,8 +526,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32XTHEADBB-NEXT: li a1, 64
; RV32XTHEADBB-NEXT: j .LBB3_5
; RV32XTHEADBB-NEXT: .LBB3_3:
-; RV32XTHEADBB-NEXT: li a1, 0
; RV32XTHEADBB-NEXT: li a0, 64
+; RV32XTHEADBB-NEXT: li a1, 0
; RV32XTHEADBB-NEXT: ret
; RV32XTHEADBB-NEXT: .LBB3_4:
; RV32XTHEADBB-NEXT: addi a1, a0, -1
@@ -1414,8 +1413,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB11_3:
-; RV32I-NEXT: li a1, 0
; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB11_4:
; RV32I-NEXT: srli a0, a1, 1
@@ -1540,8 +1539,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
; RV32M-NEXT: .LBB11_3:
-; RV32M-NEXT: li a1, 0
; RV32M-NEXT: li a0, 64
+; RV32M-NEXT: li a1, 0
; RV32M-NEXT: ret
; RV32M-NEXT: .LBB11_4:
; RV32M-NEXT: srli a0, a1, 1
diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll
index d7b00f61a50b9..1d13f723ac224 100644
--- a/llvm/test/CodeGen/RISCV/idiv_large.ll
+++ b/llvm/test/CodeGen/RISCV/idiv_large.ll
@@ -102,7 +102,6 @@ define i65 @udiv_i65(i65 %x, i65 %y) nounwind {
; RV32-NEXT: sw s3, 80(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s4, 76(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s5, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 68(sp) # 4-byte Folded Spill
; RV32-NEXT: slli a2, a3, 31
; RV32-NEXT: li t5, 64
; RV32-NEXT: bnez a2, .LBB1_5
@@ -271,183 +270,173 @@ define i65 @udiv_i65(i65 %x, i65 %y) nounwind {
; RV32-NEXT: or a1, a1, a6
; RV32-NEXT: andi a6, t6, 1
; RV32-NEXT: sub a7, a7, t0
-; RV32-NEXT: sub t5, a7, t5
+; RV32-NEXT: sub t0, a7, t5
; RV32-NEXT: sub a7, t3, s3
; RV32-NEXT: beqz a6, .LBB1_19
; RV32-NEXT: # %bb.18: # %_udiv-special-cases
-; RV32-NEXT: mv t0, a6
+; RV32-NEXT: mv t3, a6
; RV32-NEXT: j .LBB1_20
; RV32-NEXT: .LBB1_19:
-; RV32-NEXT: sltiu t0, a7, 65
-; RV32-NEXT: xori t0, t0, 1
-; RV32-NEXT: snez t3, t5
-; RV32-NEXT: or t0, t0, t3
+; RV32-NEXT: sltiu t3, a7, 65
+; RV32-NEXT: xori t3, t3, 1
+; RV32-NEXT: snez t5, t0
+; RV32-NEXT: or t3, t3, t5
; RV32-NEXT: .LBB1_20: # %_udiv-special-cases
-; RV32-NEXT: or t6, a1, t0
-; RV32-NEXT: addi a1, t6, -1
-; RV32-NEXT: and t3, t4, a1
-; RV32-NEXT: and t0, a1, a2
-; RV32-NEXT: and a1, a1, a5
-; RV32-NEXT: bnez t6, .LBB1_30
+; RV32-NEXT: or t6, a1, t3
+; RV32-NEXT: addi t5, t6, -1
+; RV32-NEXT: and a1, t4, t5
+; RV32-NEXT: and t3, t5, a2
+; RV32-NEXT: and t5, t5, a5
+; RV32-NEXT: bnez t6, .LBB1_29
; RV32-NEXT: # %bb.21: # %_udiv-special-cases
; RV32-NEXT: xori t6, a7, 64
; RV32-NEXT: or t6, t6, a6
-; RV32-NEXT: or t6, t6, t5
-; RV32-NEXT: beqz t6, .LBB1_30
+; RV32-NEXT: or t6, t6, t0
+; RV32-NEXT: beqz t6, .LBB1_29
; RV32-NEXT: # %bb.22: # %udiv-bb1
; RV32-NEXT: addi a1, a7, 1
-; RV32-NEXT: sw zero, 32(sp)
-; RV32-NEXT: sw zero, 36(sp)
; RV32-NEXT: sw zero, 40(sp)
; RV32-NEXT: sw zero, 44(sp)
-; RV32-NEXT: sw a5, 48(sp)
-; RV32-NEXT: sw a2, 52(sp)
-; RV32-NEXT: sw t4, 56(sp)
-; RV32-NEXT: li t0, 64
-; RV32-NEXT: addi t3, sp, 48
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw zero, 52(sp)
+; RV32-NEXT: sw a5, 56(sp)
+; RV32-NEXT: sw a2, 60(sp)
+; RV32-NEXT: sw t4, 64(sp)
+; RV32-NEXT: li t3, 64
+; RV32-NEXT: addi t5, sp, 56
; RV32-NEXT: neg s1, a7
; RV32-NEXT: seqz t6, a1
-; RV32-NEXT: sub a7, t0, a7
-; RV32-NEXT: add t5, t5, t6
-; RV32-NEXT: andi t0, a7, 31
+; RV32-NEXT: sub a7, t3, a7
+; RV32-NEXT: add t0, t0, t6
+; RV32-NEXT: andi t3, a7, 31
; RV32-NEXT: srli a7, a7, 3
-; RV32-NEXT: or t6, a1, t5
-; RV32-NEXT: xori s2, t0, 31
+; RV32-NEXT: or t6, a1, t0
+; RV32-NEXT: xori s2, t3, 31
; RV32-NEXT: andi a7, a7, 12
-; RV32-NEXT: seqz t0, t6
-; RV32-NEXT: sub s3, t3, a7
-; RV32-NEXT: add a6, a6, t0
-; RV32-NEXT: lw t3, 0(s3)
+; RV32-NEXT: seqz t3, t6
+; RV32-NEXT: sub s3, t5, a7
+; RV32-NEXT: add a6, a6, t3
+; RV32-NEXT: lw a7, 0(s3)
; RV32-NEXT: lw s4, 4(s3)
-; RV32-NEXT: andi a7, a6, 1
-; RV32-NEXT: or t6, t6, a7
-; RV32-NEXT: srli a6, t3, 1
-; RV32-NEXT: sll t0, s4, s1
-; RV32-NEXT: srl a6, a6, s2
-; RV32-NEXT: or t0, t0, a6
-; RV32-NEXT: sll a6, t3, s1
-; RV32-NEXT: li t3, 0
+; RV32-NEXT: andi a6, a6, 1
+; RV32-NEXT: or t6, t6, a6
+; RV32-NEXT: srli t3, a7, 1
+; RV32-NEXT: sll t5, s4, s1
+; RV32-NEXT: srl t3, t3, s2
+; RV32-NEXT: or t5, t5, t3
+; RV32-NEXT: sll t3, a7, s1
+; RV32-NEXT: li a7, 0
; RV32-NEXT: beqz t6, .LBB1_28
; RV32-NEXT: # %bb.23: # %udiv-preheader
; RV32-NEXT: li t6, 0
; RV32-NEXT: li s0, 0
; RV32-NEXT: srli s4, s4, 1
; RV32-NEXT: lw s3, 8(s3)
-; RV32-NEXT: sw zero, 16(sp)
-; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: sw zero, 24(sp)
; RV32-NEXT: sw zero, 28(sp)
-; RV32-NEXT: sw a5, 0(sp)
-; RV32-NEXT: sw a2, 4(sp)
-; RV32-NEXT: sw t4, 8(sp)
-; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw zero, 36(sp)
+; RV32-NEXT: sw a5, 8(sp)
+; RV32-NEXT: sw a2, 12(sp)
+; RV32-NEXT: sw t4, 16(sp)
+; RV32-NEXT: sw zero, 20(sp)
; RV32-NEXT: srli a2, a1, 3
; RV32-NEXT: srl a5, s4, s2
-; RV32-NEXT: mv t4, sp
+; RV32-NEXT: addi t4, sp, 8
; RV32-NEXT: snez t2, t2
; RV32-NEXT: andi a2, a2, 12
; RV32-NEXT: add t1, t1, t2
; RV32-NEXT: add a2, t4, a2
-; RV32-NEXT: lw t2, 0(a2)
-; RV32-NEXT: lw t4, 4(a2)
+; RV32-NEXT: lw t4, 0(a2)
+; RV32-NEXT: lw t2, 4(a2)
; RV32-NEXT: lw a2, 8(a2)
; RV32-NEXT: sll s1, s3, s1
; RV32-NEXT: andi s2, a1, 31
; RV32-NEXT: xori s2, s2, 31
-; RV32-NEXT: or s3, s1, a5
+; RV32-NEXT: or s1, s1, a5
; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: slli a5, t4, 1
+; RV32-NEXT: slli a5, t2, 1
; RV32-NEXT: sll a2, a2, s2
; RV32-NEXT: sll s2, a5, s2
-; RV32-NEXT: srl s1, t4, a1
-; RV32-NEXT: or s1, s1, a2
+; RV32-NEXT: srl a5, t2, a1
+; RV32-NEXT: or t2, a5, a2
; RV32-NEXT: seqz a2, a3
; RV32-NEXT: sub a2, a4, a2
; RV32-NEXT: addi a5, t1, 1
; RV32-NEXT: andi a5, a5, 1
-; RV32-NEXT: andi s3, s3, 1
-; RV32-NEXT: srl t1, t2, a1
-; RV32-NEXT: or s2, t1, s2
+; RV32-NEXT: andi s1, s1, 1
+; RV32-NEXT: srl t1, t4, a1
+; RV32-NEXT: or t4, t1, s2
; RV32-NEXT: addi t1, a3, -1
; RV32-NEXT: j .LBB1_26
; RV32-NEXT: .LBB1_24: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
-; RV32-NEXT: sltu t2, a2, s4
+; RV32-NEXT: sltu s1, a2, s2
; RV32-NEXT: .LBB1_25: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB1_26 Depth=1
-; RV32-NEXT: srli s1, s1, 31
-; RV32-NEXT: sub t4, a5, s1
-; RV32-NEXT: sub t2, t4, t2
+; RV32-NEXT: srli t2, t2, 31
+; RV32-NEXT: sub t2, a5, t2
+; RV32-NEXT: sub t2, t2, s1
; RV32-NEXT: slli t2, t2, 31
-; RV32-NEXT: srai s1, t2, 31
-; RV32-NEXT: and s3, s1, a4
-; RV32-NEXT: li t2, 0
-; RV32-NEXT: li t4, 0
-; RV32-NEXT: srli s5, a6, 31
-; RV32-NEXT: sub s4, s4, s3
-; RV32-NEXT: slli s3, t0, 1
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: srli t0, t0, 31
-; RV32-NEXT: slli a6, a6, 1
-; RV32-NEXT: or a6, t3, a6
-; RV32-NEXT: seqz t3, a1
-; RV32-NEXT: or s0, s0, t0
-; RV32-NEXT: or s5, a1, t5
-; RV32-NEXT: sub t5, t5, t3
-; RV32-NEXT: and s6, s1, a3
+; RV32-NEXT: srai t2, t2, 31
+; RV32-NEXT: and s1, t2, a4
+; RV32-NEXT: srli s3, t3, 31
+; RV32-NEXT: slli s4, t5, 1
+; RV32-NEXT: srli t5, t5, 31
+; RV32-NEXT: slli t3, t3, 1
+; RV32-NEXT: sub s2, s2, s1
+; RV32-NEXT: and s5, t2, a3
+; RV32-NEXT: or s1, s4, s3
+; RV32-NEXT: seqz s3, a1
+; RV32-NEXT: or t3, a7, t3
+; RV32-NEXT: or s4, a1, t0
; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: andi t3, s1, 1
-; RV32-NEXT: or t0, t6, s3
-; RV32-NEXT: sltu t6, s2, s6
-; RV32-NEXT: snez s5, s5
-; RV32-NEXT: andi s3, s0, 1
-; RV32-NEXT: sub s1, s4, t6
-; RV32-NEXT: add a7, a7, s5
-; RV32-NEXT: addi a7, a7, 1
-; RV32-NEXT: andi a7, a7, 1
-; RV32-NEXT: or t6, a1, t5
-; RV32-NEXT: or s4, t6, a7
-; RV32-NEXT: sub s2, s2, s6
+; RV32-NEXT: or s0, s0, t5
+; RV32-NEXT: andi a7, t2, 1
+; RV32-NEXT: sltu t2, t4, s5
+; RV32-NEXT: sub t0, t0, s3
+; RV32-NEXT: snez s3, s4
+; RV32-NEXT: or t5, t6, s1
+; RV32-NEXT: andi s1, s0, 1
+; RV32-NEXT: sub t2, s2, t2
+; RV32-NEXT: add a6, a6, s3
+; RV32-NEXT: addi a6, a6, 1
+; RV32-NEXT: andi a6, a6, 1
+; RV32-NEXT: or t6, a1, t0
+; RV32-NEXT: or s2, t6, a6
+; RV32-NEXT: sub t4, t4, s5
; RV32-NEXT: li t6, 0
; RV32-NEXT: li s0, 0
-; RV32-NEXT: beqz s4, .LBB1_29
+; RV32-NEXT: beqz s2, .LBB1_28
; RV32-NEXT: .LBB1_26: # %udiv-do-while
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32-NEXT: srli t2, s2, 31
-; RV32-NEXT: slli t4, s1, 1
-; RV32-NEXT: slli s2, s2, 1
-; RV32-NEXT: or s4, t4, t2
-; RV32-NEXT: andi t2, s3, 1
-; RV32-NEXT: or s2, s2, t2
-; RV32-NEXT: bne a2, s4, .LBB1_24
+; RV32-NEXT: srli s2, t4, 31
+; RV32-NEXT: slli s3, t2, 1
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: or s2, s3, s2
+; RV32-NEXT: andi s1, s1, 1
+; RV32-NEXT: or t4, t4, s1
+; RV32-NEXT: bne a2, s2, .LBB1_24
; RV32-NEXT: # %bb.27: # in Loop: Header=BB1_26 Depth=1
-; RV32-NEXT: sltu t2, t1, s2
+; RV32-NEXT: sltu s1, t1, t4
; RV32-NEXT: j .LBB1_25
-; RV32-NEXT: .LBB1_28:
-; RV32-NEXT: li t2, 0
-; RV32-NEXT: li t4, 0
-; RV32-NEXT: .LBB1_29: # %udiv-loop-exit
-; RV32-NEXT: srli a2, a6, 31
-; RV32-NEXT: slli a3, t0, 1
-; RV32-NEXT: srli a4, t0, 31
-; RV32-NEXT: slli a6, a6, 1
-; RV32-NEXT: or a1, t3, a6
-; RV32-NEXT: or a2, t2, a2
-; RV32-NEXT: or a4, t4, a4
-; RV32-NEXT: or t0, a2, a3
-; RV32-NEXT: andi t3, a4, 1
-; RV32-NEXT: .LBB1_30: # %udiv-end
-; RV32-NEXT: andi a2, t3, 1
-; RV32-NEXT: sw a1, 0(a0)
-; RV32-NEXT: sw t0, 4(a0)
-; RV32-NEXT: sb a2, 8(a0)
+; RV32-NEXT: .LBB1_28: # %udiv-loop-exit
+; RV32-NEXT: srli a2, t3, 31
+; RV32-NEXT: slli a3, t5, 1
+; RV32-NEXT: srli a1, t5, 31
+; RV32-NEXT: slli a4, t3, 1
+; RV32-NEXT: or t3, a3, a2
+; RV32-NEXT: or t5, a7, a4
+; RV32-NEXT: .LBB1_29: # %udiv-end
+; RV32-NEXT: sw t5, 0(a0)
+; RV32-NEXT: sw t3, 4(a0)
+; RV32-NEXT: sb a1, 8(a0)
; RV32-NEXT: lw s0, 92(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 88(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 84(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s3, 80(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s4, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s5, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 68(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 96
; RV32-NEXT: ret
;
@@ -468,563 +457,551 @@ define i65 @udiv_i65(i65 %x, i65 %y) nounwind {
define i128 @udiv_i128(i128 %x, i128 %y) nounwind {
; RV32-LABEL: udiv_i128:
; RV32: # %bb.0: # %_udiv-special-cases
-; RV32-NEXT: addi sp, sp, -160
-; RV32-NEXT: sw ra, 156(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 152(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 148(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 144(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 140(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 136(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s5, 132(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s6, 128(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s7, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s8, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s9, 116(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s10, 112(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s11, 108(sp) # 4-byte Folded Spill
-; RV32-NEXT: mv s7, a0
-; RV32-NEXT: lw s8, 0(a2)
-; RV32-NEXT: lw s9, 4(a2)
-; RV32-NEXT: lw s11, 8(a2)
-; RV32-NEXT: lw ra, 12(a2)
+; RV32-NEXT: addi sp, sp, -144
+; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a4, a0
+; RV32-NEXT: lw ra, 0(a2)
+; RV32-NEXT: lw a5, 4(a2)
+; RV32-NEXT: lw s9, 8(a2)
+; RV32-NEXT: lw s10, 12(a2)
; RV32-NEXT: lui t4, 349525
; RV32-NEXT: addi t4, t4, 1365
; RV32-NEXT: lui t3, 209715
; RV32-NEXT: addi t3, t3, 819
-; RV32-NEXT: lui t2, 61681
-; RV32-NEXT: addi t2, t2, -241
-; RV32-NEXT: bnez s9, .LBB2_2
+; RV32-NEXT: lui a7, 61681
+; RV32-NEXT: addi a7, a7, -241
+; RV32-NEXT: bnez a5, .LBB2_2
; RV32-NEXT: # %bb.1: # %_udiv-special-cases
-; RV32-NEXT: srli a0, s8, 1
-; RV32-NEXT: or a0, s8, a0
-; RV32-NEXT: srli a3, a0, 2
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 8
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 16
-; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a0, ra, 1
+; RV32-NEXT: or a0, ra, a0
+; RV32-NEXT: srli a6, a0, 2
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: srli a6, a0, 4
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: srli a6, a0, 8
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: srli a6, a0, 16
+; RV32-NEXT: or a0, a0, a6
; RV32-NEXT: not a0, a0
-; RV32-NEXT: srli a3, a0, 1
-; RV32-NEXT: and a3, a3, t4
-; RV32-NEXT: sub a0, a0, a3
-; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a6, a0, 1
+; RV32-NEXT: and a6, a6, t4
+; RV32-NEXT: sub a0, a0, a6
+; RV32-NEXT: and a6, a0, t3
; RV32-NEXT: srli a0, a0, 2
; RV32-NEXT: and a0, a0, t3
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: and a0, a0, t2
-; RV32-NEXT: slli a3, a0, 8
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a0, a6, a0
+; RV32-NEXT: srli a6, a0, 4
+; RV32-NEXT: add a0, a0, a6
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli a6, a0, 8
+; RV32-NEXT: add a0, a0, a6
+; RV32-NEXT: slli a6, a0, 16
+; RV32-NEXT: add a0, a0, a6
; RV32-NEXT: srli a0, a0, 24
; RV32-NEXT: addi t6, a0, 32
; RV32-NEXT: j .LBB2_3
; RV32-NEXT: .LBB2_2:
-; RV32-NEXT: srli a0, s9, 1
-; RV32-NEXT: or a0, s9, a0
-; RV32-NEXT: srli a3, a0, 2
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 8
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 16
-; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a0, a5, 1
+; RV32-NEXT: or a0, a5, a0
+; RV32-NEXT: srli a6, a0, 2
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: srli a6, a0, 4
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: srli a6, a0, 8
+; RV32-NEXT: or a0, a0, a6
+; RV32-NEXT: srli a6, a0, 16
+; RV32-NEXT: or a0, a0, a6
; RV32-NEXT: not a0, a0
-; RV32-NEXT: srli a3, a0, 1
-; RV32-NEXT: and a3, a3, t4
-; RV32-NEXT: sub a0, a0, a3
-; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli a6, a0, 1
+; RV32-NEXT: and a6, a6, t4
+; RV32-NEXT: sub a0, a0, a6
+; RV32-NEXT: and a6, a0, t3
; RV32-NEXT: srli a0, a0, 2
; RV32-NEXT: and a0, a0, t3
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: and a0, a0, t2
-; RV32-NEXT: slli a3, a0, 8
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a0, a6, a0
+; RV32-NEXT: srli a6, a0, 4
+; RV32-NEXT: add a0, a0, a6
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli a6, a0, 8
+; RV32-NEXT: add a0, a0, a6
+; RV32-NEXT: slli a6, a0, 16
+; RV32-NEXT: add a0, a0, a6
; RV32-NEXT: srli t6, a0, 24
; RV32-NEXT: .LBB2_3: # %_udiv-special-cases
; RV32-NEXT: lw a6, 4(a1)
-; RV32-NEXT: or s0, s11, ra
-; RV32-NEXT: bnez ra, .LBB2_5
+; RV32-NEXT: or s1, s9, s10
+; RV32-NEXT: bnez s10, .LBB2_5
; RV32-NEXT: # %bb.4: # %_udiv-special-cases
-; RV32-NEXT: srli a0, s11, 1
-; RV32-NEXT: or a0, s11, a0
-; RV32-NEXT: srli a3, a0, 2
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 8
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 16
-; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a0, s9, 1
+; RV32-NEXT: or a0, s9, a0
+; RV32-NEXT: srli t0, a0, 2
+; RV32-NEXT: or a0, a0, t0
+; RV32-NEXT: srli t0, a0, 4
+; RV32-NEXT: or a0, a0, t0
+; RV32-NEXT: srli t0, a0, 8
+; RV32-NEXT: or a0, a0, t0
+; RV32-NEXT: srli t0, a0, 16
+; RV32-NEXT: or a0, a0, t0
; RV32-NEXT: not a0, a0
-; RV32-NEXT: srli a3, a0, 1
-; RV32-NEXT: and a3, a3, t4
-; RV32-NEXT: sub a0, a0, a3
-; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli t0, a0, 1
+; RV32-NEXT: and t0, t0, t4
+; RV32-NEXT: sub a0, a0, t0
+; RV32-NEXT: and t0, a0, t3
; RV32-NEXT: srli a0, a0, 2
; RV32-NEXT: and a0, a0, t3
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: and a0, a0, t2
-; RV32-NEXT: slli a3, a0, 8
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a0, t0, a0
+; RV32-NEXT: srli t0, a0, 4
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli t0, a0, 8
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: slli t0, a0, 16
+; RV32-NEXT: add a0, a0, t0
; RV32-NEXT: srli a0, a0, 24
; RV32-NEXT: addi t5, a0, 32
; RV32-NEXT: j .LBB2_6
; RV32-NEXT: .LBB2_5:
-; RV32-NEXT: srli a0, ra, 1
-; RV32-NEXT: or a0, ra, a0
-; RV32-NEXT: srli a3, a0, 2
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 8
-; RV32-NEXT: or a0, a0, a3
-; RV32-NEXT: srli a3, a0, 16
-; RV32-NEXT: or a0, a0, a3
+; RV32-NEXT: srli a0, s10, 1
+; RV32-NEXT: or a0, s10, a0
+; RV32-NEXT: srli t0, a0, 2
+; RV32-NEXT: or a0, a0, t0
+; RV32-NEXT: srli t0, a0, 4
+; RV32-NEXT: or a0, a0, t0
+; RV32-NEXT: srli t0, a0, 8
+; RV32-NEXT: or a0, a0, t0
+; RV32-NEXT: srli t0, a0, 16
+; RV32-NEXT: or a0, a0, t0
; RV32-NEXT: not a0, a0
-; RV32-NEXT: srli a3, a0, 1
-; RV32-NEXT: and a3, a3, t4
-; RV32-NEXT: sub a0, a0, a3
-; RV32-NEXT: and a3, a0, t3
+; RV32-NEXT: srli t0, a0, 1
+; RV32-NEXT: and t0, t0, t4
+; RV32-NEXT: sub a0, a0, t0
+; RV32-NEXT: and t0, a0, t3
; RV32-NEXT: srli a0, a0, 2
; RV32-NEXT: and a0, a0, t3
-; RV32-NEXT: add a0, a3, a0
-; RV32-NEXT: srli a3, a0, 4
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: and a0, a0, t2
-; RV32-NEXT: slli a3, a0, 8
-; RV32-NEXT: add a0, a0, a3
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a0, t0, a0
+; RV32-NEXT: srli t0, a0, 4
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli t0, a0, 8
+; RV32-NEXT: add a0, a0, t0
+; RV32-NEXT: slli t0, a0, 16
+; RV32-NEXT: add a0, a0, t0
; RV32-NEXT: srli t5, a0, 24
; RV32-NEXT: .LBB2_6: # %_udiv-special-cases
-; RV32-NEXT: lw a7, 12(a1)
-; RV32-NEXT: addi a0, t6, 64
-; RV32-NEXT: bnez s0, .LBB2_8
+; RV32-NEXT: lw t0, 12(a1)
+; RV32-NEXT: addi s0, t6, 64
+; RV32-NEXT: bnez s1, .LBB2_8
; RV32-NEXT: # %bb.7: # %_udiv-special-cases
-; RV32-NEXT: mv t5, a0
+; RV32-NEXT: mv t5, s0
; RV32-NEXT: .LBB2_8: # %_udiv-special-cases
-; RV32-NEXT: lw t1, 0(a1)
-; RV32-NEXT: lw t0, 8(a1)
-; RV32-NEXT: snez s3, s0
+; RV32-NEXT: lw t2, 0(a1)
+; RV32-NEXT: lw t1, 8(a1)
+; RV32-NEXT: snez a1, s1
; RV32-NEXT: bnez a6, .LBB2_10
; RV32-NEXT: # %bb.9: # %_udiv-special-cases
-; RV32-NEXT: srli a1, t1, 1
-; RV32-NEXT: or a1, t1, a1
-; RV32-NEXT: srli a3, a1, 2
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: srli a3, a1, 4
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: srli a3, a1, 8
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: srli a3, a1, 16
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: not a1, a1
-; RV32-NEXT: srli a3, a1, 1
-; RV32-NEXT: and a3, a3, t4
-; RV32-NEXT: sub a1, a1, a3
-; RV32-NEXT: and a3, a1, t3
-; RV32-NEXT: srli a1, a1, 2
-; RV32-NEXT: and a1, a1, t3
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: srli a3, a1, 4
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: and a1, a1, t2
-; RV32-NEXT: slli a3, a1, 8
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: slli a3, a1, 16
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: srli a1, a1, 24
-; RV32-NEXT: addi a3, a1, 32
+; RV32-NEXT: srli a0, t2, 1
+; RV32-NEXT: or a0, t2, a0
+; RV32-NEXT: srli s1, a0, 2
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: srli s1, a0, 4
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: srli s1, a0, 8
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: srli s1, a0, 16
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli s1, a0, 1
+; RV32-NEXT: and s1, s1, t4
+; RV32-NEXT: sub a0, a0, s1
+; RV32-NEXT: and s1, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, s1, a0
+; RV32-NEXT: srli s1, a0, 4
+; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli s1, a0, 8
+; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: slli s1, a0, 16
+; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi a0, a0, 32
; RV32-NEXT: j .LBB2_11
; RV32-NEXT: .LBB2_10:
-; RV32-NEXT: srli a1, a6, 1
-; RV32-NEXT: or a1, a6, a1
-; RV32-NEXT: srli a3, a1, 2
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: srli a3, a1, 4
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: srli a3, a1, 8
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: srli a3, a1, 16
-; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: not a1, a1
-; RV32-NEXT: srli a3, a1, 1
-; RV32-NEXT: and a3, a3, t4
-; RV32-NEXT: sub a1, a1, a3
-; RV32-NEXT: and a3, a1, t3
-; RV32-NEXT: srli a1, a1, 2
-; RV32-NEXT: and a1, a1, t3
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: srli a3, a1, 4
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: and a1, a1, t2
-; RV32-NEXT: slli a3, a1, 8
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: slli a3, a1, 16
-; RV32-NEXT: add a1, a1, a3
-; RV32-NEXT: srli a3, a1, 24
+; RV32-NEXT: srli a0, a6, 1
+; RV32-NEXT: or a0, a6, a0
+; RV32-NEXT: srli s1, a0, 2
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: srli s1, a0, 4
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: srli s1, a0, 8
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: srli s1, a0, 16
+; RV32-NEXT: or a0, a0, s1
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli s1, a0, 1
+; RV32-NEXT: and s1, s1, t4
+; RV32-NEXT: sub a0, a0, s1
+; RV32-NEXT: and s1, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, s1, a0
+; RV32-NEXT: srli s1, a0, 4
+; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli s1, a0, 8
+; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: slli s1, a0, 16
+; RV32-NEXT: add a0, a0, s1
+; RV32-NEXT: srli a0, a0, 24
; RV32-NEXT: .LBB2_11: # %_udiv-special-cases
-; RV32-NEXT: or a1, s9, ra
-; RV32-NEXT: or s0, s8, s11
-; RV32-NEXT: or s1, a6, a7
-; RV32-NEXT: or s2, t1, t0
-; RV32-NEXT: sltu t6, a0, t6
-; RV32-NEXT: addi s3, s3, -1
-; RV32-NEXT: addi a0, a3, 64
-; RV32-NEXT: or s4, t0, a7
-; RV32-NEXT: sltu s5, a0, a3
-; RV32-NEXT: snez s6, s4
-; RV32-NEXT: addi s6, s6, -1
-; RV32-NEXT: bnez a7, .LBB2_13
+; RV32-NEXT: or s1, a5, s10
+; RV32-NEXT: or s2, ra, s9
+; RV32-NEXT: or s3, a6, t0
+; RV32-NEXT: or s4, t2, t1
+; RV32-NEXT: sltu t6, s0, t6
+; RV32-NEXT: addi s0, a1, -1
+; RV32-NEXT: addi a1, a0, 64
+; RV32-NEXT: or s5, t1, t0
+; RV32-NEXT: sltu s6, a1, a0
+; RV32-NEXT: snez s7, s5
+; RV32-NEXT: addi s7, s7, -1
+; RV32-NEXT: bnez t0, .LBB2_13
; RV32-NEXT: # %bb.12: # %_udiv-special-cases
-; RV32-NEXT: srli a3, t0, 1
-; RV32-NEXT: or a3, t0, a3
-; RV32-NEXT: srli a4, a3, 2
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: srli a4, a3, 4
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: srli a4, a3, 8
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: srli a4, a3, 16
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: not a3, a3
-; RV32-NEXT: srli a4, a3, 1
-; RV32-NEXT: and a4, a4, t4
-; RV32-NEXT: sub a3, a3, a4
-; RV32-NEXT: and a4, a3, t3
-; RV32-NEXT: srli a3, a3, 2
-; RV32-NEXT: and a3, a3, t3
-; RV32-NEXT: add a3, a4, a3
-; RV32-NEXT: srli a4, a3, 4
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: and a3, a3, t2
-; RV32-NEXT: slli a4, a3, 8
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: slli a4, a3, 16
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: srli a3, a3, 24
-; RV32-NEXT: addi a3, a3, 32
+; RV32-NEXT: srli a0, t1, 1
+; RV32-NEXT: or a0, t1, a0
+; RV32-NEXT: srli s8, a0, 2
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: srli s8, a0, 4
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: srli s8, a0, 8
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: srli s8, a0, 16
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli s8, a0, 1
+; RV32-NEXT: and t4, s8, t4
+; RV32-NEXT: sub a0, a0, t4
+; RV32-NEXT: and t4, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, t4, a0
+; RV32-NEXT: srli t3, a0, 4
+; RV32-NEXT: add a0, a0, t3
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli a7, a0, 8
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: slli a7, a0, 16
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi a0, a0, 32
; RV32-NEXT: j .LBB2_14
; RV32-NEXT: .LBB2_13:
-; RV32-NEXT: srli a3, a7, 1
-; RV32-NEXT: or a3, a7, a3
-; RV32-NEXT: srli a4, a3, 2
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: srli a4, a3, 4
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: srli a4, a3, 8
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: srli a4, a3, 16
-; RV32-NEXT: or a3, a3, a4
-; RV32-NEXT: not a3, a3
-; RV32-NEXT: srli a4, a3, 1
-; RV32-NEXT: and a4, a4, t4
-; RV32-NEXT: sub a3, a3, a4
-; RV32-NEXT: and a4, a3, t3
-; RV32-NEXT: srli a3, a3, 2
-; RV32-NEXT: and a3, a3, t3
-; RV32-NEXT: add a3, a4, a3
-; RV32-NEXT: srli a4, a3, 4
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: and a3, a3, t2
-; RV32-NEXT: slli a4, a3, 8
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: slli a4, a3, 16
-; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: srli a0, t0, 1
+; RV32-NEXT: or a0, t0, a0
+; RV32-NEXT: srli s8, a0, 2
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: srli s8, a0, 4
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: srli s8, a0, 8
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: srli s8, a0, 16
+; RV32-NEXT: or a0, a0, s8
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli s8, a0, 1
+; RV32-NEXT: and t4, s8, t4
+; RV32-NEXT: sub a0, a0, t4
+; RV32-NEXT: and t4, a0, t3
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: add a0, t4, a0
+; RV32-NEXT: srli t3, a0, 4
+; RV32-NEXT: add a0, a0, t3
+; RV32-NEXT: and a0, a0, a7
+; RV32-NEXT: slli a7, a0, 8
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: slli a7, a0, 16
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: srli a0, a0, 24
; RV32-NEXT: .LBB2_14: # %_udiv-special-cases
-; RV32-NEXT: or s0, s0, a1
-; RV32-NEXT: or a5, s2, s1
-; RV32-NEXT: and a1, s3, t6
-; RV32-NEXT: and a4, s6, s5
-; RV32-NEXT: bnez s4, .LBB2_16
+; RV32-NEXT: or t4, s2, s1
+; RV32-NEXT: or s1, s4, s3
+; RV32-NEXT: and a7, s0, t6
+; RV32-NEXT: and t3, s7, s6
+; RV32-NEXT: bnez s5, .LBB2_16
; RV32-NEXT: # %bb.15: # %_udiv-special-cases
-; RV32-NEXT: mv a3, a0
+; RV32-NEXT: mv a0, a1
; RV32-NEXT: .LBB2_16: # %_udiv-special-cases
-; RV32-NEXT: seqz a0, s0
-; RV32-NEXT: seqz a5, a5
-; RV32-NEXT: sltu t2, t5, a3
-; RV32-NEXT: sub t4, a1, a4
-; RV32-NEXT: mv t3, t2
-; RV32-NEXT: beq a1, a4, .LBB2_18
+; RV32-NEXT: seqz a1, t4
+; RV32-NEXT: seqz t4, s1
+; RV32-NEXT: sltu t6, t5, a0
+; RV32-NEXT: sub s1, a7, t3
+; RV32-NEXT: mv s0, t6
+; RV32-NEXT: beq a7, t3, .LBB2_18
; RV32-NEXT: # %bb.17: # %_udiv-special-cases
-; RV32-NEXT: sltu t3, a1, a4
+; RV32-NEXT: sltu s0, a7, t3
; RV32-NEXT: .LBB2_18: # %_udiv-special-cases
-; RV32-NEXT: sub t2, t4, t2
-; RV32-NEXT: or a0, a0, a5
-; RV32-NEXT: neg t4, t3
-; RV32-NEXT: seqz t6, t3
-; RV32-NEXT: addi t6, t6, -1
-; RV32-NEXT: or a1, t4, t6
-; RV32-NEXT: sub t3, t5, a3
-; RV32-NEXT: beqz a1, .LBB2_20
+; RV32-NEXT: sub t3, s1, t6
+; RV32-NEXT: or a1, a1, t4
+; RV32-NEXT: neg t6, s0
+; RV32-NEXT: seqz s0, s0
+; RV32-NEXT: addi s0, s0, -1
+; RV32-NEXT: or a7, t6, s0
+; RV32-NEXT: sub t4, t5, a0
+; RV32-NEXT: beqz a7, .LBB2_20
; RV32-NEXT: # %bb.19: # %_udiv-special-cases
-; RV32-NEXT: snez a1, a1
+; RV32-NEXT: snez a0, a7
; RV32-NEXT: j .LBB2_21
; RV32-NEXT: .LBB2_20:
-; RV32-NEXT: snez a1, t2
-; RV32-NEXT: sltiu a3, t3, 128
-; RV32-NEXT: xori a3, a3, 1
-; RV32-NEXT: or a1, a3, a1
+; RV32-NEXT: snez a0, t3
+; RV32-NEXT: sltiu a7, t4, 128
+; RV32-NEXT: xori a7, a7, 1
+; RV32-NEXT: or a0, a7, a0
; RV32-NEXT: .LBB2_21: # %_udiv-special-cases
-; RV32-NEXT: or a5, a0, a1
-; RV32-NEXT: addi a3, a5, -1
-; RV32-NEXT: and a0, a3, a7
-; RV32-NEXT: and a1, a3, t0
-; RV32-NEXT: and a4, a3, a6
-; RV32-NEXT: and a3, a3, t1
-; RV32-NEXT: bnez a5, .LBB2_26
+; RV32-NEXT: or s1, a1, a0
+; RV32-NEXT: addi a1, s1, -1
+; RV32-NEXT: and a7, a1, t0
+; RV32-NEXT: and t5, a1, t1
+; RV32-NEXT: and a0, a1, a6
+; RV32-NEXT: and a1, a1, t2
+; RV32-NEXT: bnez s1, .LBB2_25
; RV32-NEXT: # %bb.22: # %_udiv-special-cases
-; RV32-NEXT: xori a5, t3, 127
-; RV32-NEXT: or a5, a5, t4
-; RV32-NEXT: or t5, t2, t6
-; RV32-NEXT: or a5, a5, t5
-; RV32-NEXT: beqz a5, .LBB2_26
+; RV32-NEXT: xori s1, t4, 127
+; RV32-NEXT: or s1, s1, t6
+; RV32-NEXT: or s2, t3, s0
+; RV32-NEXT: or s1, s1, s2
+; RV32-NEXT: beqz s1, .LBB2_25
; RV32-NEXT: # %bb.23: # %udiv-bb1
-; RV32-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi a1, t3, 1
-; RV32-NEXT: sw zero, 72(sp)
-; RV32-NEXT: sw zero, 76(sp)
-; RV32-NEXT: sw zero, 80(sp)
-; RV32-NEXT: sw zero, 84(sp)
-; RV32-NEXT: sw t1, 88(sp)
-; RV32-NEXT: sw a6, 92(sp)
-; RV32-NEXT: sw t0, 96(sp)
-; RV32-NEXT: sw a7, 100(sp)
+; RV32-NEXT: sw a4, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a7, t4, 1
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw zero, 60(sp)
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw zero, 68(sp)
+; RV32-NEXT: sw t2, 72(sp)
+; RV32-NEXT: sw a6, 76(sp)
+; RV32-NEXT: sw t1, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
; RV32-NEXT: li a0, 127
-; RV32-NEXT: addi a2, sp, 88
-; RV32-NEXT: seqz a3, a1
-; RV32-NEXT: sub a0, a0, t3
-; RV32-NEXT: add t2, t2, a3
-; RV32-NEXT: andi a3, a0, 31
+; RV32-NEXT: addi a2, sp, 72
+; RV32-NEXT: seqz a4, a7
+; RV32-NEXT: sub a0, a0, t4
+; RV32-NEXT: add t3, t3, a4
+; RV32-NEXT: andi a4, a0, 31
; RV32-NEXT: srli a0, a0, 3
-; RV32-NEXT: or a4, a1, t2
-; RV32-NEXT: xori a3, a3, 31
+; RV32-NEXT: or t5, a7, t3
+; RV32-NEXT: xori a4, a4, 31
; RV32-NEXT: andi a0, a0, 12
-; RV32-NEXT: seqz t5, a4
+; RV32-NEXT: seqz t5, t5
; RV32-NEXT: sub a2, a2, a0
-; RV32-NEXT: add t5, t4, t5
+; RV32-NEXT: add t5, t6, t5
; RV32-NEXT: lw a0, 0(a2)
-; RV32-NEXT: lw a4, 4(a2)
-; RV32-NEXT: lw a5, 8(a2)
+; RV32-NEXT: lw s1, 4(a2)
+; RV32-NEXT: lw s3, 8(a2)
; RV32-NEXT: lw a2, 12(a2)
-; RV32-NEXT: sltu t4, t5, t4
-; RV32-NEXT: or s0, a1, t5
-; RV32-NEXT: add t4, t6, t4
-; RV32-NEXT: or t6, t2, t4
-; RV32-NEXT: or s0, s0, t6
-; RV32-NEXT: srli t6, a5, 1
-; RV32-NEXT: srli s1, a4, 1
-; RV32-NEXT: srli s2, a0, 1
-; RV32-NEXT: srl t6, t6, a3
-; RV32-NEXT: srl s1, s1, a3
-; RV32-NEXT: srl a3, s2, a3
-; RV32-NEXT: not t3, t3
-; RV32-NEXT: sll a2, a2, t3
-; RV32-NEXT: or s2, a2, t6
-; RV32-NEXT: sll a2, a5, t3
-; RV32-NEXT: sll a4, a4, t3
-; RV32-NEXT: or s1, a2, s1
-; RV32-NEXT: or t6, a4, a3
-; RV32-NEXT: sll t3, a0, t3
-; RV32-NEXT: bnez s0, .LBB2_27
-; RV32-NEXT: # %bb.24:
-; RV32-NEXT: li s6, 0
-; RV32-NEXT: li s7, 0
-; RV32-NEXT: li s8, 0
-; RV32-NEXT: .LBB2_25: # %udiv-loop-exit
-; RV32-NEXT: srli a0, s1, 31
+; RV32-NEXT: sltu t6, t5, t6
+; RV32-NEXT: or s2, a7, t5
+; RV32-NEXT: add t6, s0, t6
+; RV32-NEXT: or s0, t3, t6
+; RV32-NEXT: or s0, s2, s0
+; RV32-NEXT: srli s2, s3, 1
+; RV32-NEXT: srli s4, s1, 1
+; RV32-NEXT: srli s5, a0, 1
+; RV32-NEXT: srl s2, s2, a4
+; RV32-NEXT: srl s4, s4, a4
+; RV32-NEXT: srl a4, s5, a4
+; RV32-NEXT: not t4, t4
+; RV32-NEXT: sll a2, a2, t4
+; RV32-NEXT: or s2, a2, s2
+; RV32-NEXT: sll a2, s3, t4
+; RV32-NEXT: or a2, a2, s4
+; RV32-NEXT: sll s1, s1, t4
+; RV32-NEXT: or s1, s1, a4
+; RV32-NEXT: sll t4, a0, t4
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: bnez s0, .LBB2_26
+; RV32-NEXT: .LBB2_24: # %udiv-loop-exit
+; RV32-NEXT: srli a0, t4, 31
+; RV32-NEXT: slli a3, s1, 1
+; RV32-NEXT: srli s1, s1, 31
+; RV32-NEXT: or a0, a3, a0
+; RV32-NEXT: slli a3, a2, 1
+; RV32-NEXT: srli s0, a2, 31
; RV32-NEXT: slli s2, s2, 1
-; RV32-NEXT: or a0, s2, a0
-; RV32-NEXT: srli a1, t6, 31
-; RV32-NEXT: slli s1, s1, 1
-; RV32-NEXT: or a1, s1, a1
-; RV32-NEXT: srli a2, t3, 31
-; RV32-NEXT: slli t6, t6, 1
-; RV32-NEXT: slli a3, t3, 1
-; RV32-NEXT: or a3, s0, a3
-; RV32-NEXT: or a2, s6, a2
-; RV32-NEXT: or a4, a2, t6
-; RV32-NEXT: or a1, s7, a1
-; RV32-NEXT: or a0, s8, a0
-; RV32-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: .LBB2_26: # %udiv-end
-; RV32-NEXT: sw a3, 0(s7)
-; RV32-NEXT: sw a4, 4(s7)
-; RV32-NEXT: sw a1, 8(s7)
-; RV32-NEXT: sw a0, 12(s7)
-; RV32-NEXT: lw ra, 156(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 152(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 148(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 144(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 140(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 136(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s5, 132(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s6, 128(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s7, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s8, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s9, 116(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s10, 112(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s11, 108(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 160
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: or t5, a3, s1
+; RV32-NEXT: or a7, s2, s0
+; RV32-NEXT: or a1, a1, t4
+; RV32-NEXT: lw a4, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: .LBB2_25: # %udiv-end
+; RV32-NEXT: sw a1, 0(a4)
+; RV32-NEXT: sw a0, 4(a4)
+; RV32-NEXT: sw t5, 8(a4)
+; RV32-NEXT: sw a7, 12(a4)
+; RV32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 144
; RV32-NEXT: ret
-; RV32-NEXT: .LBB2_27: # %udiv-preheader
-; RV32-NEXT: li s0, 0
-; RV32-NEXT: li s5, 0
+; RV32-NEXT: .LBB2_26: # %udiv-preheader
; RV32-NEXT: li s3, 0
; RV32-NEXT: li s4, 0
-; RV32-NEXT: sw zero, 56(sp)
-; RV32-NEXT: sw zero, 60(sp)
-; RV32-NEXT: sw zero, 64(sp)
-; RV32-NEXT: sw zero, 68(sp)
-; RV32-NEXT: sw t1, 40(sp)
-; RV32-NEXT: sw a6, 44(sp)
-; RV32-NEXT: sw t0, 48(sp)
-; RV32-NEXT: sw a7, 52(sp)
-; RV32-NEXT: srli a0, a1, 3
-; RV32-NEXT: addi a2, sp, 40
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw zero, 44(sp)
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw zero, 52(sp)
+; RV32-NEXT: sw t2, 24(sp)
+; RV32-NEXT: sw a6, 28(sp)
+; RV32-NEXT: sw t1, 32(sp)
+; RV32-NEXT: sw t0, 36(sp)
+; RV32-NEXT: srli a0, a7, 3
+; RV32-NEXT: addi a4, sp, 24
; RV32-NEXT: andi a0, a0, 12
-; RV32-NEXT: add a0, a2, a0
-; RV32-NEXT: lw a2, 4(a0)
-; RV32-NEXT: lw a3, 8(a0)
-; RV32-NEXT: lw a4, 12(a0)
+; RV32-NEXT: add a0, a4, a0
+; RV32-NEXT: lw a4, 4(a0)
+; RV32-NEXT: lw a6, 8(a0)
+; RV32-NEXT: lw t2, 12(a0)
; RV32-NEXT: lw a0, 0(a0)
-; RV32-NEXT: andi a5, a1, 31
-; RV32-NEXT: xori a5, a5, 31
-; RV32-NEXT: slli a6, a4, 1
-; RV32-NEXT: slli a7, a3, 1
-; RV32-NEXT: slli t0, a2, 1
-; RV32-NEXT: sll a6, a6, a5
-; RV32-NEXT: sll a7, a7, a5
-; RV32-NEXT: sll a5, t0, a5
-; RV32-NEXT: seqz t0, s8
-; RV32-NEXT: srl a3, a3, a1
-; RV32-NEXT: or s10, a3, a6
-; RV32-NEXT: or a3, s8, s9
-; RV32-NEXT: sw s9, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sub a6, s9, t0
-; RV32-NEXT: seqz a3, a3
-; RV32-NEXT: srl a2, a2, a1
-; RV32-NEXT: or s9, a2, a7
-; RV32-NEXT: sub a7, s11, a3
-; RV32-NEXT: sw s11, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sltu a2, s11, a3
-; RV32-NEXT: sw ra, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: sub a2, ra, a2
-; RV32-NEXT: sw a2, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT: srl a0, a0, a1
-; RV32-NEXT: srl ra, a4, a1
-; RV32-NEXT: or t1, a0, a5
-; RV32-NEXT: sw s8, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s8, s8, -1
-; RV32-NEXT: sw s8, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: li s7, 0
-; RV32-NEXT: li s8, 0
-; RV32-NEXT: j .LBB2_29
-; RV32-NEXT: .LBB2_28: # %udiv-do-while
-; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
-; RV32-NEXT: li s6, 0
-; RV32-NEXT: sub a0, a0, a5
-; RV32-NEXT: srli a5, s1, 31
+; RV32-NEXT: andi t0, a7, 31
+; RV32-NEXT: xori t0, t0, 31
+; RV32-NEXT: slli t1, t2, 1
+; RV32-NEXT: slli s0, a6, 1
+; RV32-NEXT: slli s6, a4, 1
+; RV32-NEXT: sll t1, t1, t0
+; RV32-NEXT: sll s0, s0, t0
+; RV32-NEXT: sll s8, s6, t0
+; RV32-NEXT: seqz t0, ra
+; RV32-NEXT: srl a6, a6, a7
+; RV32-NEXT: or s6, a6, t1
+; RV32-NEXT: or t1, ra, a5
+; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub a6, a5, t0
+; RV32-NEXT: seqz t1, t1
+; RV32-NEXT: srl a4, a4, a7
+; RV32-NEXT: or s7, a4, s0
+; RV32-NEXT: sub t0, s9, t1
+; RV32-NEXT: mv a3, s9
+; RV32-NEXT: sltu a4, s9, t1
+; RV32-NEXT: mv t1, s10
+; RV32-NEXT: sub a4, s10, a4
+; RV32-NEXT: sw a4, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: srl a0, a0, a7
+; RV32-NEXT: srl s9, t2, a7
+; RV32-NEXT: or s8, a0, s8
+; RV32-NEXT: addi a0, ra, -1
+; RV32-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: j .LBB2_28
+; RV32-NEXT: .LBB2_27: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT: srli s0, a2, 31
; RV32-NEXT: slli s2, s2, 1
-; RV32-NEXT: or a5, s2, a5
-; RV32-NEXT: srli s2, t6, 31
+; RV32-NEXT: sub a0, s11, a0
+; RV32-NEXT: srli s11, s1, 31
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: or s0, s2, s0
+; RV32-NEXT: srli s2, t4, 31
; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: or a2, a2, s11
+; RV32-NEXT: and s11, s7, t1
; RV32-NEXT: or s1, s1, s2
-; RV32-NEXT: srli s2, t3, 31
-; RV32-NEXT: slli t6, t6, 1
-; RV32-NEXT: slli t3, t3, 1
-; RV32-NEXT: or t6, t6, s2
-; RV32-NEXT: lw a2, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: and s2, s10, a2
-; RV32-NEXT: or t3, s0, t3
-; RV32-NEXT: sub a2, a3, s2
-; RV32-NEXT: sltu a3, a3, s2
-; RV32-NEXT: lw t0, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: and s0, s10, t0
-; RV32-NEXT: sub t0, s9, s0
-; RV32-NEXT: or s2, a1, t2
-; RV32-NEXT: sub s9, a0, a4
-; RV32-NEXT: seqz a0, a1
-; RV32-NEXT: sub t2, t2, a0
-; RV32-NEXT: or t6, s5, t6
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: andi s0, s10, 1
+; RV32-NEXT: and s2, s7, a3
+; RV32-NEXT: or t4, a1, t4
+; RV32-NEXT: sub a4, t2, s2
+; RV32-NEXT: sltu t2, t2, s2
+; RV32-NEXT: or s2, a7, t3
+; RV32-NEXT: sub s11, s6, s11
+; RV32-NEXT: seqz s6, a7
+; RV32-NEXT: addi a7, a7, -1
+; RV32-NEXT: andi a1, s7, 1
+; RV32-NEXT: sub s7, a0, ra
; RV32-NEXT: seqz a0, s2
+; RV32-NEXT: sub t3, t3, s6
; RV32-NEXT: or s1, s3, s1
-; RV32-NEXT: or s2, s4, a5
-; RV32-NEXT: sub s10, a2, ra
-; RV32-NEXT: sltu a2, a2, ra
-; RV32-NEXT: sub a3, t0, a3
-; RV32-NEXT: sltu a4, t5, a0
+; RV32-NEXT: or a2, s4, a2
+; RV32-NEXT: or s2, s5, s0
+; RV32-NEXT: sub s6, a4, s9
+; RV32-NEXT: sltu a4, a4, s9
+; RV32-NEXT: sub t2, s11, t2
+; RV32-NEXT: sltu s0, t5, a0
; RV32-NEXT: sub t5, t5, a0
-; RV32-NEXT: sub ra, a3, a2
-; RV32-NEXT: sub t4, t4, a4
-; RV32-NEXT: or a0, t2, t4
-; RV32-NEXT: or a2, a1, t5
-; RV32-NEXT: or a0, a2, a0
-; RV32-NEXT: sub t1, s11, t1
-; RV32-NEXT: li s5, 0
+; RV32-NEXT: sub s9, t2, a4
+; RV32-NEXT: sub t6, t6, s0
+; RV32-NEXT: or a0, t3, t6
+; RV32-NEXT: or a4, a7, t5
+; RV32-NEXT: or a0, a4, a0
+; RV32-NEXT: sub s8, s8, s10
; RV32-NEXT: li s3, 0
; RV32-NEXT: li s4, 0
-; RV32-NEXT: beqz a0, .LBB2_25
-; RV32-NEXT: .LBB2_29: # %udiv-do-while
+; RV32-NEXT: li s5, 0
+; RV32-NEXT: mv ra, a5
+; RV32-NEXT: beqz a0, .LBB2_24
+; RV32-NEXT: .LBB2_28: # %udiv-do-while
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32-NEXT: srli a0, t1, 31
-; RV32-NEXT: slli a3, s9, 1
-; RV32-NEXT: slli t1, t1, 1
-; RV32-NEXT: or a0, a3, a0
-; RV32-NEXT: srli a3, s2, 31
-; RV32-NEXT: or s11, t1, a3
-; RV32-NEXT: beq a6, a0, .LBB2_31
-; RV32-NEXT: # %bb.30: # %udiv-do-while
-; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
-; RV32-NEXT: sltu a4, a6, a0
-; RV32-NEXT: j .LBB2_32
-; RV32-NEXT: .LBB2_31: # in Loop: Header=BB2_29 Depth=1
-; RV32-NEXT: lw a2, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: sltu a4, a2, s11
-; RV32-NEXT: .LBB2_32: # %udiv-do-while
-; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
-; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: srli a3, s10, 31
-; RV32-NEXT: slli ra, ra, 1
-; RV32-NEXT: srli a5, s9, 31
-; RV32-NEXT: slli s10, s10, 1
-; RV32-NEXT: or s9, ra, a3
-; RV32-NEXT: or a3, s10, a5
-; RV32-NEXT: sub a5, a7, a3
-; RV32-NEXT: sltu t1, a7, a3
-; RV32-NEXT: lw t0, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: sub s6, t0, s9
-; RV32-NEXT: sltu a4, a5, a4
-; RV32-NEXT: sub a5, s6, t1
-; RV32-NEXT: sub a5, a5, a4
-; RV32-NEXT: srai s10, a5, 31
-; RV32-NEXT: and t1, s10, a2
-; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: and a5, s10, a2
-; RV32-NEXT: sltu a4, s11, t1
-; RV32-NEXT: mv ra, a4
-; RV32-NEXT: beq a0, a5, .LBB2_28
-; RV32-NEXT: # %bb.33: # %udiv-do-while
-; RV32-NEXT: # in Loop: Header=BB2_29 Depth=1
-; RV32-NEXT: sltu ra, a0, a5
-; RV32-NEXT: j .LBB2_28
+; RV32-NEXT: srli a0, s8, 31
+; RV32-NEXT: slli t2, s7, 1
+; RV32-NEXT: slli s8, s8, 1
+; RV32-NEXT: or s11, t2, a0
+; RV32-NEXT: srli a0, s2, 31
+; RV32-NEXT: or s8, s8, a0
+; RV32-NEXT: beq a6, s11, .LBB2_30
+; RV32-NEXT: # %bb.29: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT: sltu a0, a6, s11
+; RV32-NEXT: j .LBB2_31
+; RV32-NEXT: .LBB2_30: # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a0, a0, s8
+; RV32-NEXT: .LBB2_31: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT: srli t2, s6, 31
+; RV32-NEXT: slli s9, s9, 1
+; RV32-NEXT: srli s7, s7, 31
+; RV32-NEXT: slli s10, s6, 1
+; RV32-NEXT: or s6, s9, t2
+; RV32-NEXT: or t2, s10, s7
+; RV32-NEXT: sub s7, t0, t2
+; RV32-NEXT: sltu s9, t0, t2
+; RV32-NEXT: lw a4, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s10, a4, s6
+; RV32-NEXT: sltu a0, s7, a0
+; RV32-NEXT: sub s7, s10, s9
+; RV32-NEXT: sub a0, s7, a0
+; RV32-NEXT: srai s7, a0, 31
+; RV32-NEXT: mv a5, ra
+; RV32-NEXT: and s10, s7, ra
+; RV32-NEXT: lw a0, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a0, s7, a0
+; RV32-NEXT: sltu ra, s8, s10
+; RV32-NEXT: mv s9, ra
+; RV32-NEXT: beq s11, a0, .LBB2_27
+; RV32-NEXT: # %bb.32: # %udiv-do-while
+; RV32-NEXT: # in Loop: Header=BB2_28 Depth=1
+; RV32-NEXT: sltu s9, s11, a0
+; RV32-NEXT: j .LBB2_27
;
; RV64-LABEL: udiv_i128:
; RV64: # %bb.0:
@@ -1055,32 +1032,147 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: sw s9, 196(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s10, 192(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s11, 188(sp) # 4-byte Folded Spill
-; RV32-NEXT: mv ra, a0
+; RV32-NEXT: mv s8, a0
; RV32-NEXT: lw t2, 16(a2)
; RV32-NEXT: lw a4, 0(a2)
; RV32-NEXT: lw a5, 4(a2)
; RV32-NEXT: lw a6, 8(a2)
; RV32-NEXT: lw a0, 12(a2)
-; RV32-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 28(sp) # 4-byte Folded Spill
; RV32-NEXT: lui a0, 349525
; RV32-NEXT: lui a2, 209715
; RV32-NEXT: lui a3, 61681
; RV32-NEXT: addi t5, a0, 1365
; RV32-NEXT: addi t4, a2, 819
; RV32-NEXT: addi t3, a3, -241
-; RV32-NEXT: sw a6, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a6, 32(sp) # 4-byte Folded Spill
; RV32-NEXT: slli a0, a6, 31
; RV32-NEXT: srli a2, a5, 1
-; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a5, 24(sp) # 4-byte Folded Spill
; RV32-NEXT: slli a3, a5, 31
-; RV32-NEXT: or a0, a2, a0
-; RV32-NEXT: sw a4, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: srli a2, a4, 1
-; RV32-NEXT: or a2, a2, a3
-; RV32-NEXT: bnez a0, .LBB3_2
+; RV32-NEXT: or a6, a2, a0
+; RV32-NEXT: sw a4, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: srli a0, a4, 1
+; RV32-NEXT: or a7, a0, a3
+; RV32-NEXT: bnez a6, .LBB3_2
; RV32-NEXT: # %bb.1: # %_udiv-special-cases
-; RV32-NEXT: srli a3, a2, 1
-; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a0, a7, 1
+; RV32-NEXT: or a0, a7, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi a4, a0, 32
+; RV32-NEXT: j .LBB3_3
+; RV32-NEXT: .LBB3_2:
+; RV32-NEXT: srli a0, a6, 1
+; RV32-NEXT: or a0, a6, a0
+; RV32-NEXT: srli a2, a0, 2
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 8
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: srli a2, a0, 16
+; RV32-NEXT: or a0, a0, a2
+; RV32-NEXT: not a0, a0
+; RV32-NEXT: srli a2, a0, 1
+; RV32-NEXT: and a2, a2, t5
+; RV32-NEXT: sub a0, a0, a2
+; RV32-NEXT: and a2, a0, t4
+; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: and a0, a0, t4
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: srli a2, a0, 4
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: and a0, a0, t3
+; RV32-NEXT: slli a2, a0, 8
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: slli a2, a0, 16
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: srli a4, a0, 24
+; RV32-NEXT: .LBB3_3: # %_udiv-special-cases
+; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a0, a5, 1
+; RV32-NEXT: slli a3, t2, 31
+; RV32-NEXT: slli a5, a5, 31
+; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli t0, a2, 1
+; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: slli a2, a2, 31
+; RV32-NEXT: li s2, 64
+; RV32-NEXT: bnez a2, .LBB3_5
+; RV32-NEXT: # %bb.4: # %_udiv-special-cases
+; RV32-NEXT: li t1, 64
+; RV32-NEXT: j .LBB3_6
+; RV32-NEXT: .LBB3_5:
+; RV32-NEXT: srli t1, a2, 1
+; RV32-NEXT: or t1, a2, t1
+; RV32-NEXT: srli t6, t1, 2
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 8
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: srli t6, t1, 16
+; RV32-NEXT: or t1, t1, t6
+; RV32-NEXT: not t1, t1
+; RV32-NEXT: srli t6, t1, 1
+; RV32-NEXT: and t6, t6, t5
+; RV32-NEXT: sub t1, t1, t6
+; RV32-NEXT: and t6, t1, t4
+; RV32-NEXT: srli t1, t1, 2
+; RV32-NEXT: and t1, t1, t4
+; RV32-NEXT: add t1, t6, t1
+; RV32-NEXT: srli t6, t1, 4
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: and t1, t1, t3
+; RV32-NEXT: slli t6, t1, 8
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: slli t6, t1, 16
+; RV32-NEXT: add t1, t1, t6
+; RV32-NEXT: srli t1, t1, 24
+; RV32-NEXT: .LBB3_6: # %_udiv-special-cases
+; RV32-NEXT: or a3, a3, a0
+; RV32-NEXT: or a5, t0, a5
+; RV32-NEXT: bnez a2, .LBB3_8
+; RV32-NEXT: # %bb.7: # %_udiv-special-cases
+; RV32-NEXT: li t1, 128
+; RV32-NEXT: .LBB3_8: # %_udiv-special-cases
+; RV32-NEXT: or t0, a5, a3
+; RV32-NEXT: addi a2, a4, 64
+; RV32-NEXT: addi a0, t1, 128
+; RV32-NEXT: or a6, a6, a3
+; RV32-NEXT: or a7, a7, a5
+; RV32-NEXT: or s3, a7, a6
+; RV32-NEXT: sltu s0, a0, t1
+; RV32-NEXT: bnez s3, .LBB3_11
+; RV32-NEXT: # %bb.9: # %_udiv-special-cases
+; RV32-NEXT: mv t6, s0
+; RV32-NEXT: beqz a3, .LBB3_12
+; RV32-NEXT: .LBB3_10:
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: or a3, a3, a4
; RV32-NEXT: srli a4, a3, 2
; RV32-NEXT: or a3, a3, a4
; RV32-NEXT: srli a4, a3, 4
@@ -1104,12 +1196,18 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: add a3, a3, a4
; RV32-NEXT: slli a4, a3, 16
; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: srli a3, a3, 24
-; RV32-NEXT: addi a6, a3, 32
-; RV32-NEXT: j .LBB3_3
-; RV32-NEXT: .LBB3_2:
-; RV32-NEXT: srli a3, a0, 1
-; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli s1, a3, 24
+; RV32-NEXT: beqz t0, .LBB3_13
+; RV32-NEXT: j .LBB3_14
+; RV32-NEXT: .LBB3_11:
+; RV32-NEXT: snez a6, t0
+; RV32-NEXT: sltu a4, a2, a4
+; RV32-NEXT: addi a6, a6, -1
+; RV32-NEXT: and t6, a6, a4
+; RV32-NEXT: bnez a3, .LBB3_10
+; RV32-NEXT: .LBB3_12: # %_udiv-special-cases
+; RV32-NEXT: srli a3, a5, 1
+; RV32-NEXT: or a3, a5, a3
; RV32-NEXT: srli a4, a3, 2
; RV32-NEXT: or a3, a3, a4
; RV32-NEXT: srli a4, a3, 4
@@ -1133,69 +1231,31 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: add a3, a3, a4
; RV32-NEXT: slli a4, a3, 16
; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: srli a6, a3, 24
-; RV32-NEXT: .LBB3_3: # %_udiv-special-cases
-; RV32-NEXT: lw a7, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: srli a3, a7, 1
-; RV32-NEXT: slli a5, t2, 31
-; RV32-NEXT: slli a7, a7, 31
-; RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: srli t0, a4, 1
-; RV32-NEXT: lw a4, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: slli a4, a4, 31
-; RV32-NEXT: li s2, 64
-; RV32-NEXT: bnez a4, .LBB3_5
-; RV32-NEXT: # %bb.4: # %_udiv-special-cases
-; RV32-NEXT: li t6, 64
-; RV32-NEXT: j .LBB3_6
-; RV32-NEXT: .LBB3_5:
-; RV32-NEXT: srli t1, a4, 1
-; RV32-NEXT: or t1, a4, t1
-; RV32-NEXT: srli t6, t1, 2
-; RV32-NEXT: or t1, t1, t6
-; RV32-NEXT: srli t6, t1, 4
-; RV32-NEXT: or t1, t1, t6
-; RV32-NEXT: srli t6, t1, 8
-; RV32-NEXT: or t1, t1, t6
-; RV32-NEXT: srli t6, t1, 16
-; RV32-NEXT: or t1, t1, t6
-; RV32-NEXT: not t1, t1
-; RV32-NEXT: srli t6, t1, 1
-; RV32-NEXT: and t6, t6, t5
-; RV32-NEXT: sub t1, t1, t6
-; RV32-NEXT: and t6, t1, t4
-; RV32-NEXT: srli t1, t1, 2
-; RV32-NEXT: and t1, t1, t4
-; RV32-NEXT: add t1, t6, t1
-; RV32-NEXT: srli t6, t1, 4
-; RV32-NEXT: add t1, t1, t6
-; RV32-NEXT: and t1, t1, t3
-; RV32-NEXT: slli t6, t1, 8
-; RV32-NEXT: add t1, t1, t6
-; RV32-NEXT: slli t6, t1, 16
-; RV32-NEXT: add t1, t1, t6
-; RV32-NEXT: srli t6, t1, 24
-; RV32-NEXT: .LBB3_6: # %_udiv-special-cases
-; RV32-NEXT: or t1, a5, a3
-; RV32-NEXT: or a7, t0, a7
-; RV32-NEXT: bnez a4, .LBB3_8
-; RV32-NEXT: # %bb.7: # %_udiv-special-cases
-; RV32-NEXT: li t6, 128
-; RV32-NEXT: .LBB3_8: # %_udiv-special-cases
-; RV32-NEXT: or a5, a7, t1
-; RV32-NEXT: addi a4, a6, 64
-; RV32-NEXT: addi a3, t6, 128
-; RV32-NEXT: or a0, a0, t1
-; RV32-NEXT: or a2, a2, a7
-; RV32-NEXT: or s3, a2, a0
-; RV32-NEXT: sltu s0, a3, t6
-; RV32-NEXT: bnez s3, .LBB3_11
-; RV32-NEXT: # %bb.9: # %_udiv-special-cases
-; RV32-NEXT: mv t6, s0
-; RV32-NEXT: beqz t1, .LBB3_12
-; RV32-NEXT: .LBB3_10:
-; RV32-NEXT: srli a0, t1, 1
-; RV32-NEXT: or a0, t1, a0
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi s1, a3, 32
+; RV32-NEXT: bnez t0, .LBB3_14
+; RV32-NEXT: .LBB3_13: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a2
+; RV32-NEXT: .LBB3_14: # %_udiv-special-cases
+; RV32-NEXT: lw a7, 0(a1)
+; RV32-NEXT: lw t0, 4(a1)
+; RV32-NEXT: lw a6, 8(a1)
+; RV32-NEXT: bnez s3, .LBB3_16
+; RV32-NEXT: # %bb.15: # %_udiv-special-cases
+; RV32-NEXT: mv s1, a0
+; RV32-NEXT: .LBB3_16: # %_udiv-special-cases
+; RV32-NEXT: lw t1, 12(a1)
+; RV32-NEXT: lw a1, 16(a1)
+; RV32-NEXT: slli a0, a6, 31
+; RV32-NEXT: srli a2, t0, 1
+; RV32-NEXT: or s4, a2, a0
+; RV32-NEXT: slli a0, t0, 31
+; RV32-NEXT: srli a2, a7, 1
+; RV32-NEXT: or s5, a2, a0
+; RV32-NEXT: bnez s4, .LBB3_18
+; RV32-NEXT: # %bb.17: # %_udiv-special-cases
+; RV32-NEXT: srli a0, s5, 1
+; RV32-NEXT: or a0, s5, a0
; RV32-NEXT: srli a2, a0, 2
; RV32-NEXT: or a0, a0, a2
; RV32-NEXT: srli a2, a0, 4
@@ -1219,18 +1279,12 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: slli a2, a0, 16
; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: srli s1, a0, 24
-; RV32-NEXT: beqz a5, .LBB3_13
-; RV32-NEXT: j .LBB3_14
-; RV32-NEXT: .LBB3_11:
-; RV32-NEXT: snez a0, a5
-; RV32-NEXT: sltu a2, a4, a6
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and t6, a0, a2
-; RV32-NEXT: bnez t1, .LBB3_10
-; RV32-NEXT: .LBB3_12: # %_udiv-special-cases
-; RV32-NEXT: srli a0, a7, 1
-; RV32-NEXT: or a0, a7, a0
+; RV32-NEXT: srli a0, a0, 24
+; RV32-NEXT: addi a4, a0, 32
+; RV32-NEXT: j .LBB3_19
+; RV32-NEXT: .LBB3_18:
+; RV32-NEXT: srli a0, s4, 1
+; RV32-NEXT: or a0, s4, a0
; RV32-NEXT: srli a2, a0, 2
; RV32-NEXT: or a0, a0, a2
; RV32-NEXT: srli a2, a0, 4
@@ -1254,31 +1308,63 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: slli a2, a0, 16
; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: srli a0, a0, 24
-; RV32-NEXT: addi s1, a0, 32
-; RV32-NEXT: bnez a5, .LBB3_14
-; RV32-NEXT: .LBB3_13: # %_udiv-special-cases
-; RV32-NEXT: mv s1, a4
-; RV32-NEXT: .LBB3_14: # %_udiv-special-cases
-; RV32-NEXT: lw a7, 0(a1)
-; RV32-NEXT: lw t0, 4(a1)
-; RV32-NEXT: lw a6, 8(a1)
-; RV32-NEXT: bnez s3, .LBB3_16
-; RV32-NEXT: # %bb.15: # %_udiv-special-cases
-; RV32-NEXT: mv s1, a3
-; RV32-NEXT: .LBB3_16: # %_udiv-special-cases
-; RV32-NEXT: lw t1, 12(a1)
-; RV32-NEXT: lw a1, 16(a1)
-; RV32-NEXT: slli a0, a6, 31
-; RV32-NEXT: srli a2, t0, 1
-; RV32-NEXT: or a0, a2, a0
-; RV32-NEXT: slli a2, t0, 31
-; RV32-NEXT: srli a3, a7, 1
-; RV32-NEXT: or a2, a3, a2
-; RV32-NEXT: bnez a0, .LBB3_18
-; RV32-NEXT: # %bb.17: # %_udiv-special-cases
-; RV32-NEXT: srli a3, a2, 1
-; RV32-NEXT: or a3, a2, a3
+; RV32-NEXT: srli a4, a0, 24
+; RV32-NEXT: .LBB3_19: # %_udiv-special-cases
+; RV32-NEXT: srli a0, t1, 1
+; RV32-NEXT: slli a2, a1, 31
+; RV32-NEXT: slli a3, t1, 31
+; RV32-NEXT: slli a5, a7, 31
+; RV32-NEXT: srli s6, a6, 1
+; RV32-NEXT: beqz a5, .LBB3_21
+; RV32-NEXT: # %bb.20:
+; RV32-NEXT: srli s2, a5, 1
+; RV32-NEXT: or s2, a5, s2
+; RV32-NEXT: srli s7, s2, 2
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 8
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: srli s7, s2, 16
+; RV32-NEXT: or s2, s2, s7
+; RV32-NEXT: not s2, s2
+; RV32-NEXT: srli s7, s2, 1
+; RV32-NEXT: and s7, s7, t5
+; RV32-NEXT: sub s2, s2, s7
+; RV32-NEXT: and s7, s2, t4
+; RV32-NEXT: srli s2, s2, 2
+; RV32-NEXT: and s2, s2, t4
+; RV32-NEXT: add s2, s7, s2
+; RV32-NEXT: srli s7, s2, 4
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: and s2, s2, t3
+; RV32-NEXT: slli s7, s2, 8
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: slli s7, s2, 16
+; RV32-NEXT: add s2, s2, s7
+; RV32-NEXT: srli s2, s2, 24
+; RV32-NEXT: .LBB3_21: # %_udiv-special-cases
+; RV32-NEXT: or s7, a2, a0
+; RV32-NEXT: or a3, s6, a3
+; RV32-NEXT: bnez a5, .LBB3_23
+; RV32-NEXT: # %bb.22: # %_udiv-special-cases
+; RV32-NEXT: li s2, 128
+; RV32-NEXT: .LBB3_23: # %_udiv-special-cases
+; RV32-NEXT: or a2, a3, s7
+; RV32-NEXT: addi a0, a4, 64
+; RV32-NEXT: addi s6, s2, 128
+; RV32-NEXT: or a5, s4, s7
+; RV32-NEXT: or s4, s5, a3
+; RV32-NEXT: or s5, s4, a5
+; RV32-NEXT: sltu s4, s6, s2
+; RV32-NEXT: bnez s5, .LBB3_26
+; RV32-NEXT: # %bb.24: # %_udiv-special-cases
+; RV32-NEXT: mv s2, s4
+; RV32-NEXT: snez s3, s3
+; RV32-NEXT: beqz s7, .LBB3_27
+; RV32-NEXT: .LBB3_25:
+; RV32-NEXT: srli a3, s7, 1
+; RV32-NEXT: or a3, s7, a3
; RV32-NEXT: srli a4, a3, 2
; RV32-NEXT: or a3, a3, a4
; RV32-NEXT: srli a4, a3, 4
@@ -1302,12 +1388,18 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: add a3, a3, a4
; RV32-NEXT: slli a4, a3, 16
; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: srli a3, a3, 24
-; RV32-NEXT: addi s5, a3, 32
-; RV32-NEXT: j .LBB3_19
-; RV32-NEXT: .LBB3_18:
-; RV32-NEXT: srli a3, a0, 1
-; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: srli a4, a3, 24
+; RV32-NEXT: j .LBB3_28
+; RV32-NEXT: .LBB3_26:
+; RV32-NEXT: snez a5, a2
+; RV32-NEXT: sltu a4, a0, a4
+; RV32-NEXT: addi a5, a5, -1
+; RV32-NEXT: and s2, a5, a4
+; RV32-NEXT: snez s3, s3
+; RV32-NEXT: bnez s7, .LBB3_25
+; RV32-NEXT: .LBB3_27: # %_udiv-special-cases
+; RV32-NEXT: srli a4, a3, 1
+; RV32-NEXT: or a3, a3, a4
; RV32-NEXT: srli a4, a3, 2
; RV32-NEXT: or a3, a3, a4
; RV32-NEXT: srli a4, a3, 4
@@ -1331,215 +1423,100 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: add a3, a3, a4
; RV32-NEXT: slli a4, a3, 16
; RV32-NEXT: add a3, a3, a4
-; RV32-NEXT: srli s5, a3, 24
-; RV32-NEXT: .LBB3_19: # %_udiv-special-cases
-; RV32-NEXT: srli a3, t1, 1
-; RV32-NEXT: slli a4, a1, 31
-; RV32-NEXT: slli a5, t1, 31
-; RV32-NEXT: slli s4, a7, 31
-; RV32-NEXT: srli s6, a6, 1
-; RV32-NEXT: beqz s4, .LBB3_21
-; RV32-NEXT: # %bb.20:
-; RV32-NEXT: srli s2, s4, 1
-; RV32-NEXT: or s2, s4, s2
-; RV32-NEXT: srli s7, s2, 2
-; RV32-NEXT: or s2, s2, s7
-; RV32-NEXT: srli s7, s2, 4
-; RV32-NEXT: or s2, s2, s7
-; RV32-NEXT: srli s7, s2, 8
-; RV32-NEXT: or s2, s2, s7
-; RV32-NEXT: srli s7, s2, 16
-; RV32-NEXT: or s2, s2, s7
-; RV32-NEXT: not s2, s2
-; RV32-NEXT: srli s7, s2, 1
-; RV32-NEXT: and s7, s7, t5
-; RV32-NEXT: sub s2, s2, s7
-; RV32-NEXT: and s7, s2, t4
-; RV32-NEXT: srli s2, s2, 2
-; RV32-NEXT: and s2, s2, t4
-; RV32-NEXT: add s2, s7, s2
-; RV32-NEXT: srli s7, s2, 4
-; RV32-NEXT: add s2, s2, s7
-; RV32-NEXT: and s2, s2, t3
-; RV32-NEXT: slli s7, s2, 8
-; RV32-NEXT: add s2, s2, s7
-; RV32-NEXT: slli s7, s2, 16
-; RV32-NEXT: add s2, s2, s7
-; RV32-NEXT: srli s2, s2, 24
-; RV32-NEXT: .LBB3_21: # %_udiv-special-cases
-; RV32-NEXT: or s7, a4, a3
-; RV32-NEXT: or s6, s6, a5
-; RV32-NEXT: bnez s4, .LBB3_23
-; RV32-NEXT: # %bb.22: # %_udiv-special-cases
-; RV32-NEXT: li s2, 128
-; RV32-NEXT: .LBB3_23: # %_udiv-special-cases
-; RV32-NEXT: or s4, s6, s7
-; RV32-NEXT: addi a5, s5, 64
-; RV32-NEXT: addi a3, s2, 128
-; RV32-NEXT: or a0, a0, s7
-; RV32-NEXT: or a4, a2, s6
-; RV32-NEXT: or a4, a4, a0
-; RV32-NEXT: sltu a0, a3, s2
-; RV32-NEXT: bnez a4, .LBB3_26
-; RV32-NEXT: # %bb.24: # %_udiv-special-cases
-; RV32-NEXT: mv a2, a0
-; RV32-NEXT: snez s2, s3
-; RV32-NEXT: beqz s7, .LBB3_27
-; RV32-NEXT: .LBB3_25:
-; RV32-NEXT: srli s3, s7, 1
-; RV32-NEXT: or s3, s7, s3
-; RV32-NEXT: srli s5, s3, 2
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: srli s5, s3, 4
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: srli s5, s3, 8
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: srli s5, s3, 16
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: not s3, s3
-; RV32-NEXT: srli s5, s3, 1
-; RV32-NEXT: and t5, s5, t5
-; RV32-NEXT: sub t5, s3, t5
-; RV32-NEXT: and s3, t5, t4
-; RV32-NEXT: srli t5, t5, 2
-; RV32-NEXT: and t4, t5, t4
-; RV32-NEXT: add t4, s3, t4
-; RV32-NEXT: srli t5, t4, 4
-; RV32-NEXT: add t4, t4, t5
-; RV32-NEXT: and t3, t4, t3
-; RV32-NEXT: slli t4, t3, 8
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: slli t4, t3, 16
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: srli t3, t3, 24
-; RV32-NEXT: j .LBB3_28
-; RV32-NEXT: .LBB3_26:
-; RV32-NEXT: snez a2, s4
-; RV32-NEXT: sltu s2, a5, s5
-; RV32-NEXT: addi a2, a2, -1
-; RV32-NEXT: and a2, a2, s2
-; RV32-NEXT: snez s2, s3
-; RV32-NEXT: bnez s7, .LBB3_25
-; RV32-NEXT: .LBB3_27: # %_udiv-special-cases
-; RV32-NEXT: srli s3, s6, 1
-; RV32-NEXT: or s3, s6, s3
-; RV32-NEXT: srli s5, s3, 2
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: srli s5, s3, 4
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: srli s5, s3, 8
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: srli s5, s3, 16
-; RV32-NEXT: or s3, s3, s5
-; RV32-NEXT: not s3, s3
-; RV32-NEXT: srli s5, s3, 1
-; RV32-NEXT: and t5, s5, t5
-; RV32-NEXT: sub t5, s3, t5
-; RV32-NEXT: and s3, t5, t4
-; RV32-NEXT: srli t5, t5, 2
-; RV32-NEXT: and t4, t5, t4
-; RV32-NEXT: add t4, s3, t4
-; RV32-NEXT: srli t5, t4, 4
-; RV32-NEXT: add t4, t4, t5
-; RV32-NEXT: and t3, t4, t3
-; RV32-NEXT: slli t4, t3, 8
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: slli t4, t3, 16
-; RV32-NEXT: add t3, t3, t4
-; RV32-NEXT: srli t3, t3, 24
-; RV32-NEXT: addi t3, t3, 32
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: addi a4, a3, 32
; RV32-NEXT: .LBB3_28: # %_udiv-special-cases
-; RV32-NEXT: xori t4, s0, 1
-; RV32-NEXT: addi s2, s2, -1
-; RV32-NEXT: bnez s4, .LBB3_30
+; RV32-NEXT: xori a3, s0, 1
+; RV32-NEXT: addi s3, s3, -1
+; RV32-NEXT: bnez a2, .LBB3_30
; RV32-NEXT: # %bb.29: # %_udiv-special-cases
-; RV32-NEXT: mv t3, a5
+; RV32-NEXT: mv a4, a0
; RV32-NEXT: .LBB3_30: # %_udiv-special-cases
; RV32-NEXT: andi s11, a1, 1
-; RV32-NEXT: andi s8, t2, 1
-; RV32-NEXT: lw a1, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: or s9, a1, a5
-; RV32-NEXT: or t2, a7, a6
-; RV32-NEXT: neg a1, t4
-; RV32-NEXT: and s0, s2, s0
-; RV32-NEXT: bnez a4, .LBB3_32
+; RV32-NEXT: andi a0, t2, 1
+; RV32-NEXT: lw a1, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s9, a1, a2
+; RV32-NEXT: or a5, a7, a6
+; RV32-NEXT: neg a1, a3
+; RV32-NEXT: and t2, s3, s0
+; RV32-NEXT: bnez s5, .LBB3_32
; RV32-NEXT: # %bb.31: # %_udiv-special-cases
-; RV32-NEXT: mv t3, a3
+; RV32-NEXT: mv a4, s6
; RV32-NEXT: .LBB3_32: # %_udiv-special-cases
-; RV32-NEXT: lw a3, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: or s10, a3, a5
-; RV32-NEXT: or a5, s9, s8
-; RV32-NEXT: or t4, t0, t1
-; RV32-NEXT: or t5, t2, s11
-; RV32-NEXT: and a1, s0, a1
-; RV32-NEXT: xori a3, a0, 1
-; RV32-NEXT: snez a4, a4
-; RV32-NEXT: neg a3, a3
-; RV32-NEXT: addi a4, a4, -1
-; RV32-NEXT: and a0, a4, a0
-; RV32-NEXT: sltu a4, s1, t3
-; RV32-NEXT: and t2, a0, a3
-; RV32-NEXT: mv a3, a4
-; RV32-NEXT: beq t6, a2, .LBB3_34
+; RV32-NEXT: lw a2, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s10, a2, a3
+; RV32-NEXT: or a2, s9, a0
+; RV32-NEXT: or a3, t0, t1
+; RV32-NEXT: or t4, a5, s11
+; RV32-NEXT: and a1, t2, a1
+; RV32-NEXT: xori a5, s4, 1
+; RV32-NEXT: snez t2, s5
+; RV32-NEXT: neg a5, a5
+; RV32-NEXT: addi t2, t2, -1
+; RV32-NEXT: and t3, t2, s4
+; RV32-NEXT: sltu t2, s1, a4
+; RV32-NEXT: and t3, t3, a5
+; RV32-NEXT: mv a5, t2
+; RV32-NEXT: beq t6, s2, .LBB3_34
; RV32-NEXT: # %bb.33: # %_udiv-special-cases
-; RV32-NEXT: sltu a3, t6, a2
+; RV32-NEXT: sltu a5, t6, s2
; RV32-NEXT: .LBB3_34: # %_udiv-special-cases
-; RV32-NEXT: or a0, a5, s10
-; RV32-NEXT: or t5, t5, t4
-; RV32-NEXT: sltu t4, a1, t2
-; RV32-NEXT: mv s0, a3
-; RV32-NEXT: beq a1, t2, .LBB3_36
+; RV32-NEXT: or a2, a2, s10
+; RV32-NEXT: or a3, t4, a3
+; RV32-NEXT: sltu t5, a1, t3
+; RV32-NEXT: mv t4, a5
+; RV32-NEXT: beq a1, t3, .LBB3_36
; RV32-NEXT: # %bb.35: # %_udiv-special-cases
-; RV32-NEXT: mv s0, t4
+; RV32-NEXT: mv t4, t5
; RV32-NEXT: .LBB3_36: # %_udiv-special-cases
-; RV32-NEXT: seqz a5, a0
-; RV32-NEXT: seqz t5, t5
-; RV32-NEXT: andi a0, s0, 1
-; RV32-NEXT: sub a2, t6, a2
-; RV32-NEXT: sub a1, a1, t2
-; RV32-NEXT: sub t2, a2, a4
-; RV32-NEXT: sltu a2, a1, a3
-; RV32-NEXT: add a2, t4, a2
-; RV32-NEXT: neg t4, a2
-; RV32-NEXT: sub a4, a1, a3
-; RV32-NEXT: or a1, a4, t4
-; RV32-NEXT: sub a3, s1, t3
+; RV32-NEXT: seqz a2, a2
+; RV32-NEXT: seqz a3, a3
+; RV32-NEXT: andi t4, t4, 1
+; RV32-NEXT: sub t6, t6, s2
+; RV32-NEXT: sub a1, a1, t3
+; RV32-NEXT: sub t2, t6, t2
+; RV32-NEXT: sltu t3, a1, a5
+; RV32-NEXT: add t3, t5, t3
+; RV32-NEXT: neg t3, t3
+; RV32-NEXT: sub t5, a1, a5
+; RV32-NEXT: or a1, t5, t3
+; RV32-NEXT: sub t6, s1, a4
; RV32-NEXT: beqz a1, .LBB3_38
; RV32-NEXT: # %bb.37: # %_udiv-special-cases
; RV32-NEXT: snez a1, a1
-; RV32-NEXT: or a2, a5, t5
-; RV32-NEXT: bnez a0, .LBB3_39
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: bnez t4, .LBB3_39
; RV32-NEXT: j .LBB3_40
; RV32-NEXT: .LBB3_38:
; RV32-NEXT: snez a1, t2
-; RV32-NEXT: sltiu a2, a3, 129
-; RV32-NEXT: xori a2, a2, 1
-; RV32-NEXT: or a1, a2, a1
-; RV32-NEXT: or a2, a5, t5
-; RV32-NEXT: beqz a0, .LBB3_40
+; RV32-NEXT: sltiu a4, t6, 129
+; RV32-NEXT: xori a4, a4, 1
+; RV32-NEXT: or a1, a4, a1
+; RV32-NEXT: or a2, a2, a3
+; RV32-NEXT: beqz t4, .LBB3_40
; RV32-NEXT: .LBB3_39: # %_udiv-special-cases
-; RV32-NEXT: mv a1, a0
+; RV32-NEXT: mv a1, t4
; RV32-NEXT: .LBB3_40: # %_udiv-special-cases
-; RV32-NEXT: or t6, a2, a1
-; RV32-NEXT: addi a1, t6, -1
-; RV32-NEXT: and a2, s11, a1
-; RV32-NEXT: and a5, a1, t1
-; RV32-NEXT: and t3, a1, a6
-; RV32-NEXT: and t5, a1, t0
-; RV32-NEXT: and a1, a1, a7
-; RV32-NEXT: bnez t6, .LBB3_57
+; RV32-NEXT: or a5, a2, a1
+; RV32-NEXT: addi a4, a5, -1
+; RV32-NEXT: and s0, s11, a4
+; RV32-NEXT: and a3, a4, t1
+; RV32-NEXT: and a2, a4, a6
+; RV32-NEXT: and a1, a4, t0
+; RV32-NEXT: and a4, a4, a7
+; RV32-NEXT: bnez a5, .LBB3_57
; RV32-NEXT: # %bb.41: # %_udiv-special-cases
-; RV32-NEXT: or t6, t2, t4
-; RV32-NEXT: xori s0, a3, 128
-; RV32-NEXT: or s0, s0, a0
-; RV32-NEXT: or s0, s0, a4
-; RV32-NEXT: or t6, s0, t6
-; RV32-NEXT: beqz t6, .LBB3_57
+; RV32-NEXT: or a5, t2, t3
+; RV32-NEXT: xori s1, t6, 128
+; RV32-NEXT: or s1, s1, t4
+; RV32-NEXT: or s1, s1, t5
+; RV32-NEXT: or a5, s1, a5
+; RV32-NEXT: beqz a5, .LBB3_57
; RV32-NEXT: # %bb.42: # %udiv-bb1
-; RV32-NEXT: sw ra, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi a1, a3, 1
+; RV32-NEXT: sw s8, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi a1, t6, 1
; RV32-NEXT: sw zero, 136(sp)
; RV32-NEXT: sw zero, 140(sp)
; RV32-NEXT: sw zero, 144(sp)
@@ -1553,55 +1530,55 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: sw a6, 160(sp)
; RV32-NEXT: sw t1, 164(sp)
; RV32-NEXT: sw s11, 168(sp)
-; RV32-NEXT: li a5, 128
-; RV32-NEXT: addi t3, sp, 152
-; RV32-NEXT: neg a2, a3
-; RV32-NEXT: seqz t5, a1
-; RV32-NEXT: sub a5, a5, a3
-; RV32-NEXT: add t2, t2, t5
-; RV32-NEXT: andi a3, a5, 31
-; RV32-NEXT: srli t5, a5, 3
-; RV32-NEXT: or t6, a1, t2
-; RV32-NEXT: xori a5, a3, 31
-; RV32-NEXT: andi a3, t5, 28
-; RV32-NEXT: seqz t6, t6
-; RV32-NEXT: sub ra, t3, a3
-; RV32-NEXT: add t6, a4, t6
-; RV32-NEXT: lw t3, 0(ra)
-; RV32-NEXT: lw s0, 4(ra)
-; RV32-NEXT: lw s1, 8(ra)
-; RV32-NEXT: lw a3, 12(ra)
-; RV32-NEXT: sltu a4, t6, a4
-; RV32-NEXT: or t5, a1, t6
-; RV32-NEXT: add t4, t4, a4
-; RV32-NEXT: or a4, t2, t4
-; RV32-NEXT: or a4, t5, a4
-; RV32-NEXT: srli t5, s1, 1
-; RV32-NEXT: seqz s2, a4
-; RV32-NEXT: add a0, a0, s2
-; RV32-NEXT: sll s2, a3, a2
-; RV32-NEXT: srl t5, t5, a5
-; RV32-NEXT: or t5, s2, t5
-; RV32-NEXT: srli s2, s0, 1
-; RV32-NEXT: sll s1, s1, a2
-; RV32-NEXT: srl s2, s2, a5
+; RV32-NEXT: li a2, 128
+; RV32-NEXT: addi a3, sp, 152
+; RV32-NEXT: neg ra, t6
+; RV32-NEXT: seqz a4, a1
+; RV32-NEXT: sub a2, a2, t6
+; RV32-NEXT: add t2, t2, a4
+; RV32-NEXT: andi a4, a2, 31
+; RV32-NEXT: srli a2, a2, 3
+; RV32-NEXT: or a5, a1, t2
+; RV32-NEXT: xori s8, a4, 31
+; RV32-NEXT: andi a2, a2, 28
+; RV32-NEXT: seqz t6, a5
+; RV32-NEXT: sub a2, a3, a2
+; RV32-NEXT: add t6, t5, t6
+; RV32-NEXT: lw a3, 0(a2)
+; RV32-NEXT: lw a5, 4(a2)
+; RV32-NEXT: lw s1, 8(a2)
+; RV32-NEXT: lw a4, 12(a2)
+; RV32-NEXT: sltu t5, t6, t5
+; RV32-NEXT: or s0, a1, t6
+; RV32-NEXT: add t3, t3, t5
+; RV32-NEXT: or t5, t2, t3
+; RV32-NEXT: or t5, s0, t5
+; RV32-NEXT: srli s0, s1, 1
+; RV32-NEXT: seqz s2, t5
+; RV32-NEXT: add t4, t4, s2
+; RV32-NEXT: sll s2, a4, ra
+; RV32-NEXT: srl s0, s0, s8
+; RV32-NEXT: or s0, s2, s0
+; RV32-NEXT: srli s2, a5, 1
+; RV32-NEXT: sll s1, s1, ra
+; RV32-NEXT: srl s2, s2, s8
; RV32-NEXT: or s2, s1, s2
-; RV32-NEXT: srli s1, t3, 1
-; RV32-NEXT: sll s0, s0, a2
-; RV32-NEXT: srl s1, s1, a5
-; RV32-NEXT: andi s3, a0, 1
-; RV32-NEXT: or s1, s0, s1
-; RV32-NEXT: or a0, a4, s3
-; RV32-NEXT: sll t3, t3, a2
-; RV32-NEXT: beqz a0, .LBB3_55
+; RV32-NEXT: srli s1, a3, 1
+; RV32-NEXT: sll a5, a5, ra
+; RV32-NEXT: srl s3, s1, s8
+; RV32-NEXT: andi s1, t4, 1
+; RV32-NEXT: or s3, a5, s3
+; RV32-NEXT: or a5, t5, s1
+; RV32-NEXT: sll t5, a3, ra
+; RV32-NEXT: beqz a5, .LBB3_55
; RV32-NEXT: # %bb.43: # %udiv-preheader
; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s6, 0
; RV32-NEXT: li s7, 0
-; RV32-NEXT: srli a3, a3, 1
-; RV32-NEXT: lw a0, 16(ra)
+; RV32-NEXT: srli a4, a4, 1
+; RV32-NEXT: lw a2, 16(a2)
; RV32-NEXT: sw zero, 104(sp)
; RV32-NEXT: sw zero, 108(sp)
; RV32-NEXT: sw zero, 112(sp)
@@ -1618,222 +1595,209 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV32-NEXT: sw t0, 60(sp)
; RV32-NEXT: sw a6, 64(sp)
; RV32-NEXT: sw t1, 68(sp)
-; RV32-NEXT: srli a4, a1, 3
-; RV32-NEXT: addi a6, sp, 56
-; RV32-NEXT: andi a7, a1, 31
-; RV32-NEXT: or t0, s9, s10
-; RV32-NEXT: srl a3, a3, a5
-; RV32-NEXT: andi a4, a4, 28
-; RV32-NEXT: xori a5, a7, 31
-; RV32-NEXT: snez a7, t0
-; RV32-NEXT: add a4, a6, a4
-; RV32-NEXT: add a7, s8, a7
-; RV32-NEXT: lw a6, 16(a4)
-; RV32-NEXT: lw t0, 0(a4)
-; RV32-NEXT: lw t1, 4(a4)
-; RV32-NEXT: lw s0, 8(a4)
-; RV32-NEXT: lw a4, 12(a4)
-; RV32-NEXT: sll a0, a0, a2
-; RV32-NEXT: or a3, a0, a3
-; RV32-NEXT: slli a6, a6, 1
-; RV32-NEXT: slli a0, a4, 1
-; RV32-NEXT: slli a2, s0, 1
-; RV32-NEXT: slli s4, t1, 1
-; RV32-NEXT: sll a6, a6, a5
-; RV32-NEXT: sll a0, a0, a5
-; RV32-NEXT: sll s8, a2, a5
-; RV32-NEXT: sll s4, s4, a5
-; RV32-NEXT: srl a2, a4, a1
-; RV32-NEXT: or ra, a2, a6
-; RV32-NEXT: lw a6, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: seqz a4, a6
-; RV32-NEXT: srl a2, s0, a1
-; RV32-NEXT: or a2, a2, a0
-; RV32-NEXT: lw a5, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: or a0, a6, a5
-; RV32-NEXT: sub s5, a5, a4
-; RV32-NEXT: seqz a4, a0
-; RV32-NEXT: srl a0, t1, a1
-; RV32-NEXT: or a0, a0, s8
-; RV32-NEXT: lw a5, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: sub t1, a5, a4
-; RV32-NEXT: sw t1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sltu a4, a5, a4
-; RV32-NEXT: addi a7, a7, 1
+; RV32-NEXT: srli a3, a1, 3
+; RV32-NEXT: addi a5, sp, 56
+; RV32-NEXT: andi a6, a1, 31
+; RV32-NEXT: or a7, s9, s10
+; RV32-NEXT: srl a4, a4, s8
+; RV32-NEXT: andi a3, a3, 28
+; RV32-NEXT: xori a6, a6, 31
+; RV32-NEXT: snez a7, a7
+; RV32-NEXT: add a3, a5, a3
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: lw a5, 16(a3)
+; RV32-NEXT: lw a7, 0(a3)
+; RV32-NEXT: lw t0, 4(a3)
+; RV32-NEXT: lw t1, 8(a3)
+; RV32-NEXT: lw a3, 12(a3)
+; RV32-NEXT: sll a2, a2, ra
+; RV32-NEXT: or a2, a2, a4
+; RV32-NEXT: slli a5, a5, 1
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: slli t4, t1, 1
+; RV32-NEXT: slli s4, t0, 1
+; RV32-NEXT: sll a5, a5, a6
+; RV32-NEXT: sll a4, a4, a6
+; RV32-NEXT: sll t4, t4, a6
+; RV32-NEXT: sll a6, s4, a6
+; RV32-NEXT: srl a3, a3, a1
+; RV32-NEXT: or s9, a3, a5
+; RV32-NEXT: lw s4, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: seqz a3, s4
+; RV32-NEXT: srl a5, t1, a1
+; RV32-NEXT: or ra, a5, a4
; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: sub s6, a5, a4
-; RV32-NEXT: andi a4, a7, 1
-; RV32-NEXT: sw a4, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT: andi a5, a3, 1
-; RV32-NEXT: srl a3, t0, a1
-; RV32-NEXT: or a4, a3, s4
-; RV32-NEXT: addi a6, a6, -1
-; RV32-NEXT: sw a6, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: li s11, 0
-; RV32-NEXT: li s10, 0
+; RV32-NEXT: or a4, s4, a5
+; RV32-NEXT: sub a5, a5, a3
+; RV32-NEXT: seqz a3, a4
+; RV32-NEXT: srl a4, t0, a1
+; RV32-NEXT: or s11, a4, t4
+; RV32-NEXT: lw a4, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub t0, a4, a3
+; RV32-NEXT: sw t0, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a3, a4, a3
+; RV32-NEXT: addi a0, a0, 1
+; RV32-NEXT: lw a4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub s5, a4, a3
+; RV32-NEXT: andi a0, a0, 1
+; RV32-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: andi a0, a2, 1
+; RV32-NEXT: srl a2, a7, a1
+; RV32-NEXT: or s8, a2, a6
+; RV32-NEXT: addi s4, s4, -1
+; RV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a5, 20(sp) # 4-byte Folded Spill
; RV32-NEXT: j .LBB3_45
; RV32-NEXT: .LBB3_44: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: and s0, a5, s0
-; RV32-NEXT: xor s8, t1, a7
-; RV32-NEXT: xor s9, a2, s0
-; RV32-NEXT: or s8, s9, s8
-; RV32-NEXT: li s9, 0
-; RV32-NEXT: li s8, 0
-; RV32-NEXT: sltu s4, a2, s0
-; RV32-NEXT: sub s0, a2, s0
-; RV32-NEXT: sub a7, t1, a7
+; RV32-NEXT: lw a2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: and t1, a0, a2
+; RV32-NEXT: xor a2, a6, a3
+; RV32-NEXT: xor a7, ra, t1
+; RV32-NEXT: or a2, a7, a2
; RV32-NEXT: srli a2, s2, 31
-; RV32-NEXT: sub a0, a0, t0
-; RV32-NEXT: slli t0, t5, 1
-; RV32-NEXT: or t0, t0, a2
-; RV32-NEXT: srli a2, s1, 31
+; RV32-NEXT: sltu a7, ra, t1
+; RV32-NEXT: sub t1, ra, t1
+; RV32-NEXT: slli ra, s0, 1
+; RV32-NEXT: sub a3, a6, a3
+; RV32-NEXT: srli a6, s3, 31
; RV32-NEXT: slli s2, s2, 1
-; RV32-NEXT: or t1, s2, a2
-; RV32-NEXT: srli a2, t3, 31
-; RV32-NEXT: slli s1, s1, 1
-; RV32-NEXT: or s1, s1, a2
-; RV32-NEXT: slli t3, t3, 1
-; RV32-NEXT: lw a2, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT: or t3, a2, t3
-; RV32-NEXT: srli a2, t5, 31
-; RV32-NEXT: or s7, s7, a2
-; RV32-NEXT: sub a2, s0, ra
-; RV32-NEXT: sltu s0, s0, ra
-; RV32-NEXT: or t5, a1, t6
-; RV32-NEXT: sub a7, a7, s4
-; RV32-NEXT: or s2, t2, t4
-; RV32-NEXT: sub a0, a0, a6
-; RV32-NEXT: or a6, a1, t2
-; RV32-NEXT: or s4, t5, s2
-; RV32-NEXT: seqz t5, a1
+; RV32-NEXT: sub a5, s11, a5
+; RV32-NEXT: srli s11, t5, 31
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: srli s0, s0, 31
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: or a2, ra, a2
+; RV32-NEXT: or t0, a1, t6
+; RV32-NEXT: or a6, s2, a6
+; RV32-NEXT: or s2, t2, t3
+; RV32-NEXT: or s3, s3, s11
+; RV32-NEXT: or t4, a1, t2
+; RV32-NEXT: lw s4, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or t5, s4, t5
+; RV32-NEXT: seqz s4, a1
; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: andi a5, a5, 1
-; RV32-NEXT: sw a5, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT: seqz a6, a6
-; RV32-NEXT: sub t2, t2, t5
-; RV32-NEXT: lw a5, 48(sp) # 4-byte Folded Reload
-; RV32-NEXT: or s1, a5, s1
-; RV32-NEXT: lw a5, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: or s2, a5, t1
-; RV32-NEXT: lw a5, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: or t5, a5, t0
-; RV32-NEXT: andi a5, s7, 1
-; RV32-NEXT: sub ra, a7, s0
-; RV32-NEXT: snez a7, s4
-; RV32-NEXT: sltu t0, t6, a6
-; RV32-NEXT: sub t6, t6, a6
-; RV32-NEXT: add a7, s3, a7
-; RV32-NEXT: sub t4, t4, t0
-; RV32-NEXT: or a6, a1, t6
-; RV32-NEXT: addi a7, a7, 1
-; RV32-NEXT: or t0, t2, t4
-; RV32-NEXT: andi s3, a7, 1
-; RV32-NEXT: or a6, a6, t0
-; RV32-NEXT: or a6, a6, s3
-; RV32-NEXT: sub a4, a4, a3
+; RV32-NEXT: or s7, s7, s0
+; RV32-NEXT: andi a0, a0, 1
+; RV32-NEXT: sw a0, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: sub ra, t1, s9
+; RV32-NEXT: sltu t1, t1, s9
+; RV32-NEXT: sub a3, a3, a7
+; RV32-NEXT: sub s11, a5, a4
+; RV32-NEXT: or a4, t0, s2
+; RV32-NEXT: seqz a5, t4
+; RV32-NEXT: sub t2, t2, s4
+; RV32-NEXT: lw a0, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s3, a0, s3
+; RV32-NEXT: lw a0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: or s2, a0, a6
+; RV32-NEXT: or s0, s6, a2
+; RV32-NEXT: andi a0, s7, 1
+; RV32-NEXT: sub s9, a3, t1
+; RV32-NEXT: snez a2, a4
+; RV32-NEXT: sltu a3, t6, a5
+; RV32-NEXT: sub t6, t6, a5
+; RV32-NEXT: add a2, s1, a2
+; RV32-NEXT: sub t3, t3, a3
+; RV32-NEXT: or a3, a1, t6
+; RV32-NEXT: addi a2, a2, 1
+; RV32-NEXT: or a4, t2, t3
+; RV32-NEXT: andi s1, a2, 1
+; RV32-NEXT: or a3, a3, a4
+; RV32-NEXT: or a3, a3, s1
+; RV32-NEXT: sub s8, s10, s8
; RV32-NEXT: sw zero, 48(sp) # 4-byte Folded Spill
; RV32-NEXT: sw zero, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw zero, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: li s6, 0
; RV32-NEXT: li s7, 0
-; RV32-NEXT: beqz a6, .LBB3_56
+; RV32-NEXT: lw a5, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: beqz a3, .LBB3_56
; RV32-NEXT: .LBB3_45: # %udiv-do-while
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
-; RV32-NEXT: srli a3, a2, 31
-; RV32-NEXT: slli a6, ra, 1
-; RV32-NEXT: or t1, a6, a3
-; RV32-NEXT: srli a3, a0, 31
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: or a2, a2, a3
-; RV32-NEXT: beq s6, t1, .LBB3_47
+; RV32-NEXT: srli a2, ra, 31
+; RV32-NEXT: slli a3, s9, 1
+; RV32-NEXT: or a6, a3, a2
+; RV32-NEXT: srli a3, s11, 31
+; RV32-NEXT: slli ra, ra, 1
+; RV32-NEXT: or ra, ra, a3
+; RV32-NEXT: beq s5, a6, .LBB3_47
; RV32-NEXT: # %bb.46: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: sltu a3, s6, t1
+; RV32-NEXT: sltu a3, s5, a6
; RV32-NEXT: j .LBB3_48
; RV32-NEXT: .LBB3_47: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: lw a3, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: sltu a3, a3, a2
+; RV32-NEXT: lw a2, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a3, a2, ra
; RV32-NEXT: .LBB3_48: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: srli a6, a4, 31
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: slli a4, a4, 1
-; RV32-NEXT: or a0, a0, a6
-; RV32-NEXT: andi a5, a5, 1
-; RV32-NEXT: or a4, a4, a5
-; RV32-NEXT: beq s5, a0, .LBB3_50
+; RV32-NEXT: srli a4, s8, 31
+; RV32-NEXT: slli s11, s11, 1
+; RV32-NEXT: slli s8, s8, 1
+; RV32-NEXT: or s11, s11, a4
+; RV32-NEXT: andi a0, a0, 1
+; RV32-NEXT: or s10, s8, a0
+; RV32-NEXT: beq a5, s11, .LBB3_50
; RV32-NEXT: # %bb.49: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: sltu a5, s5, a0
+; RV32-NEXT: sltu a0, a5, s11
; RV32-NEXT: j .LBB3_51
; RV32-NEXT: .LBB3_50: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: lw a5, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: sltu a5, a5, a4
+; RV32-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a0, a0, s10
; RV32-NEXT: .LBB3_51: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: lw a6, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: xor a6, a6, a2
-; RV32-NEXT: xor a7, s6, t1
-; RV32-NEXT: or a6, a6, a7
-; RV32-NEXT: beqz a6, .LBB3_53
+; RV32-NEXT: lw a2, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: xor a4, a2, ra
+; RV32-NEXT: xor a5, s5, a6
+; RV32-NEXT: or a4, a4, a5
+; RV32-NEXT: beqz a4, .LBB3_53
; RV32-NEXT: # %bb.52: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: mv a5, a3
+; RV32-NEXT: mv a0, a3
; RV32-NEXT: .LBB3_53: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: srli a3, ra, 31
-; RV32-NEXT: lw a6, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT: sub a3, a6, a3
-; RV32-NEXT: sub a3, a3, a5
-; RV32-NEXT: slli a3, a3, 31
-; RV32-NEXT: srai a5, a3, 31
-; RV32-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: and a7, a5, a3
-; RV32-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: and a3, a5, a3
-; RV32-NEXT: lw a6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT: and t0, a5, a6
-; RV32-NEXT: sltu a6, a4, a3
-; RV32-NEXT: mv ra, a6
-; RV32-NEXT: beq a0, t0, .LBB3_44
+; RV32-NEXT: srli a3, s9, 31
+; RV32-NEXT: lw a2, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: sub a3, a2, a3
+; RV32-NEXT: sub a3, a3, a0
+; RV32-NEXT: slli a0, a3, 31
+; RV32-NEXT: srai a0, a0, 31
+; RV32-NEXT: lw a3, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a3, a0, a3
+; RV32-NEXT: lw a2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: and s8, a0, a2
+; RV32-NEXT: lw a5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: and a5, a0, a5
+; RV32-NEXT: sltu a4, s10, s8
+; RV32-NEXT: mv s9, a4
+; RV32-NEXT: beq s11, a5, .LBB3_44
; RV32-NEXT: # %bb.54: # %udiv-do-while
; RV32-NEXT: # in Loop: Header=BB3_45 Depth=1
-; RV32-NEXT: sltu ra, a0, t0
+; RV32-NEXT: sltu s9, s11, a5
; RV32-NEXT: j .LBB3_44
; RV32-NEXT: .LBB3_55:
; RV32-NEXT: sw zero, 52(sp) # 4-byte Folded Spill
-; RV32-NEXT: li s11, 0
-; RV32-NEXT: li s9, 0
-; RV32-NEXT: li s10, 0
-; RV32-NEXT: li s8, 0
; RV32-NEXT: .LBB3_56: # %udiv-loop-exit
-; RV32-NEXT: srli a0, s2, 31
-; RV32-NEXT: slli a1, t5, 1
-; RV32-NEXT: or a0, a1, a0
-; RV32-NEXT: srli a1, s1, 31
-; RV32-NEXT: slli s2, s2, 1
-; RV32-NEXT: or a2, s2, a1
-; RV32-NEXT: srli a3, t3, 31
-; RV32-NEXT: slli s1, s1, 1
-; RV32-NEXT: srli a4, t5, 31
-; RV32-NEXT: slli t3, t3, 1
-; RV32-NEXT: lw a1, 52(sp) # 4-byte Folded Reload
-; RV32-NEXT: or a1, a1, t3
-; RV32-NEXT: or a3, s11, a3
-; RV32-NEXT: or a4, s8, a4
-; RV32-NEXT: or t5, a3, s1
-; RV32-NEXT: or t3, s9, a2
-; RV32-NEXT: or a5, s10, a0
-; RV32-NEXT: andi a2, a4, 1
-; RV32-NEXT: lw ra, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: srli a0, t5, 31
+; RV32-NEXT: slli a1, s3, 1
+; RV32-NEXT: srli a2, s3, 31
+; RV32-NEXT: or a1, a1, a0
+; RV32-NEXT: slli a0, s2, 1
+; RV32-NEXT: srli a3, s2, 31
+; RV32-NEXT: or a2, a0, a2
+; RV32-NEXT: slli a0, s0, 1
+; RV32-NEXT: srli s0, s0, 31
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: or a3, a0, a3
+; RV32-NEXT: lw a0, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: or a4, a0, t5
+; RV32-NEXT: lw s8, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: .LBB3_57: # %udiv-end
-; RV32-NEXT: sw a1, 0(ra)
-; RV32-NEXT: sw t5, 4(ra)
-; RV32-NEXT: sw t3, 8(ra)
-; RV32-NEXT: sw a5, 12(ra)
-; RV32-NEXT: andi a2, a2, 1
-; RV32-NEXT: sb a2, 16(ra)
+; RV32-NEXT: sw a4, 0(s8)
+; RV32-NEXT: sw a1, 4(s8)
+; RV32-NEXT: sw a2, 8(s8)
+; RV32-NEXT: sw a3, 12(s8)
+; RV32-NEXT: sb s0, 16(s8)
; RV32-NEXT: lw ra, 236(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 232(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 228(sp) # 4-byte Folded Reload
@@ -1939,14 +1903,13 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV64-NEXT: add a2, a2, a5
; RV64-NEXT: srli t2, a2, 56
; RV64-NEXT: .LBB3_3: # %_udiv-special-cases
-; RV64-NEXT: addi sp, sp, -192
-; RV64-NEXT: sd s0, 184(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 176(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 168(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 160(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 152(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s5, 144(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s6, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi sp, sp, -176
+; RV64-NEXT: sd s0, 168(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 160(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 152(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 144(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 136(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 128(sp) # 8-byte Folded Spill
; RV64-NEXT: slli a2, a3, 63
; RV64-NEXT: li t5, 128
; RV64-NEXT: bnez a2, .LBB3_5
@@ -2131,29 +2094,29 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV64-NEXT: or a1, a1, a6
; RV64-NEXT: andi a6, t6, 1
; RV64-NEXT: sub a7, a7, t0
-; RV64-NEXT: sub t5, a7, t5
+; RV64-NEXT: sub t0, a7, t5
; RV64-NEXT: sub a7, t2, s4
; RV64-NEXT: beqz a6, .LBB3_19
; RV64-NEXT: # %bb.18: # %_udiv-special-cases
-; RV64-NEXT: mv t0, a6
+; RV64-NEXT: mv t2, a6
; RV64-NEXT: j .LBB3_20
; RV64-NEXT: .LBB3_19:
-; RV64-NEXT: sltiu t0, a7, 129
-; RV64-NEXT: xori t0, t0, 1
-; RV64-NEXT: snez t2, t5
-; RV64-NEXT: or t0, t0, t2
+; RV64-NEXT: sltiu t2, a7, 129
+; RV64-NEXT: xori t2, t2, 1
+; RV64-NEXT: snez t5, t0
+; RV64-NEXT: or t2, t2, t5
; RV64-NEXT: .LBB3_20: # %_udiv-special-cases
-; RV64-NEXT: or t6, a1, t0
-; RV64-NEXT: addi a1, t6, -1
-; RV64-NEXT: and t2, t4, a1
-; RV64-NEXT: and t0, a1, a2
-; RV64-NEXT: and a1, a1, a5
-; RV64-NEXT: bnez t6, .LBB3_30
+; RV64-NEXT: or t6, a1, t2
+; RV64-NEXT: addi t5, t6, -1
+; RV64-NEXT: and a1, t4, t5
+; RV64-NEXT: and t2, t5, a2
+; RV64-NEXT: and t5, t5, a5
+; RV64-NEXT: bnez t6, .LBB3_29
; RV64-NEXT: # %bb.21: # %_udiv-special-cases
; RV64-NEXT: xori t6, a7, 128
; RV64-NEXT: or t6, t6, a6
-; RV64-NEXT: or t6, t6, t5
-; RV64-NEXT: beqz t6, .LBB3_30
+; RV64-NEXT: or t6, t6, t0
+; RV64-NEXT: beqz t6, .LBB3_29
; RV64-NEXT: # %bb.22: # %udiv-bb1
; RV64-NEXT: addi a1, a7, 1
; RV64-NEXT: sd zero, 64(sp)
@@ -2163,30 +2126,30 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV64-NEXT: sd a5, 96(sp)
; RV64-NEXT: sd a2, 104(sp)
; RV64-NEXT: sd t4, 112(sp)
-; RV64-NEXT: li t0, 128
-; RV64-NEXT: addi t2, sp, 96
+; RV64-NEXT: li t2, 128
+; RV64-NEXT: addi t5, sp, 96
; RV64-NEXT: neg s1, a7
; RV64-NEXT: seqz t6, a1
-; RV64-NEXT: sub a7, t0, a7
-; RV64-NEXT: add t5, t5, t6
-; RV64-NEXT: andi t0, a7, 63
+; RV64-NEXT: sub a7, t2, a7
+; RV64-NEXT: add t0, t0, t6
+; RV64-NEXT: andi t2, a7, 63
; RV64-NEXT: srli a7, a7, 3
-; RV64-NEXT: or t6, a1, t5
-; RV64-NEXT: xori s2, t0, 63
+; RV64-NEXT: or t6, a1, t0
+; RV64-NEXT: xori s2, t2, 63
; RV64-NEXT: andi a7, a7, 24
-; RV64-NEXT: seqz t0, t6
-; RV64-NEXT: sub s3, t2, a7
-; RV64-NEXT: add a6, a6, t0
-; RV64-NEXT: ld t2, 0(s3)
+; RV64-NEXT: seqz t2, t6
+; RV64-NEXT: sub s3, t5, a7
+; RV64-NEXT: add a6, a6, t2
+; RV64-NEXT: ld a7, 0(s3)
; RV64-NEXT: ld s4, 8(s3)
-; RV64-NEXT: andi a7, a6, 1
-; RV64-NEXT: or t6, t6, a7
-; RV64-NEXT: srli a6, t2, 1
-; RV64-NEXT: sll t0, s4, s1
-; RV64-NEXT: srl a6, a6, s2
-; RV64-NEXT: or t0, t0, a6
-; RV64-NEXT: sll a6, t2, s1
-; RV64-NEXT: li t2, 0
+; RV64-NEXT: andi a6, a6, 1
+; RV64-NEXT: or t6, t6, a6
+; RV64-NEXT: srli t2, a7, 1
+; RV64-NEXT: sll t5, s4, s1
+; RV64-NEXT: srl t2, t2, s2
+; RV64-NEXT: or t5, t5, t2
+; RV64-NEXT: sll t2, a7, s1
+; RV64-NEXT: li a7, 0
; RV64-NEXT: beqz t6, .LBB3_28
; RV64-NEXT: # %bb.23: # %udiv-preheader
; RV64-NEXT: li t6, 0
@@ -2208,107 +2171,97 @@ define i129 @udiv_i129(i129 %x, i129 %y) nounwind {
; RV64-NEXT: andi a2, a2, 24
; RV64-NEXT: add t1, t1, t3
; RV64-NEXT: add a2, t4, a2
-; RV64-NEXT: ld t3, 0(a2)
-; RV64-NEXT: ld t4, 8(a2)
+; RV64-NEXT: ld t4, 0(a2)
+; RV64-NEXT: ld t3, 8(a2)
; RV64-NEXT: ld a2, 16(a2)
; RV64-NEXT: sll s1, s3, s1
; RV64-NEXT: andi s2, a1, 63
; RV64-NEXT: xori s2, s2, 63
-; RV64-NEXT: or s3, s1, a5
+; RV64-NEXT: or s1, s1, a5
; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: slli a5, t4, 1
+; RV64-NEXT: slli a5, t3, 1
; RV64-NEXT: sll a2, a2, s2
; RV64-NEXT: sll s2, a5, s2
-; RV64-NEXT: srl s1, t4, a1
-; RV64-NEXT: or s1, s1, a2
+; RV64-NEXT: srl a5, t3, a1
+; RV64-NEXT: or t3, a5, a2
; RV64-NEXT: seqz a2, a3
; RV64-NEXT: sub a2, a4, a2
; RV64-NEXT: addi a5, t1, 1
; RV64-NEXT: andi a5, a5, 1
-; RV64-NEXT: andi s3, s3, 1
-; RV64-NEXT: srl t1, t3, a1
-; RV64-NEXT: or s2, t1, s2
+; RV64-NEXT: andi s1, s1, 1
+; RV64-NEXT: srl t1, t4, a1
+; RV64-NEXT: or t4, t1, s2
; RV64-NEXT: addi t1, a3, -1
; RV64-NEXT: j .LBB3_26
; RV64-NEXT: .LBB3_24: # %udiv-do-while
; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
-; RV64-NEXT: sltu t3, a2, s4
+; RV64-NEXT: sltu s1, a2, s2
; RV64-NEXT: .LBB3_25: # %udiv-do-while
; RV64-NEXT: # in Loop: Header=BB3_26 Depth=1
-; RV64-NEXT: srli s1, s1, 63
-; RV64-NEXT: sub t4, a5, s1
-; RV64-NEXT: sub t3, t4, t3
+; RV64-NEXT: srli t3, t3, 63
+; RV64-NEXT: sub t3, a5, t3
+; RV64-NEXT: sub t3, t3, s1
; RV64-NEXT: slli t3, t3, 63
-; RV64-NEXT: srai s1, t3, 63
-; RV64-NEXT: and s3, s1, a4
-; RV64-NEXT: li t3, 0
-; RV64-NEXT: li t4, 0
-; RV64-NEXT: srli s5, a6, 63
-; RV64-NEXT: sub s4, s4, s3
-; RV64-NEXT: slli s3, t0, 1
-; RV64-NEXT: or s3, s3, s5
-; RV64-NEXT: srli t0, t0, 63
-; RV64-NEXT: slli a6, a6, 1
-; RV64-NEXT: or a6, t2, a6
-; RV64-NEXT: seqz t2, a1
-; RV64-NEXT: or s0, s0, t0
-; RV64-NEXT: or s5, a1, t5
-; RV64-NEXT: sub t5, t5, t2
-; RV64-NEXT: and s6, s1, a3
+; RV64-NEXT: srai t3, t3, 63
+; RV64-NEXT: and s1, t3, a4
+; RV64-NEXT: srli s3, t2, 63
+; RV64-NEXT: slli s4, t5, 1
+; RV64-NEXT: srli t5, t5, 63
+; RV64-NEXT: slli t2, t2, 1
+; RV64-NEXT: sub s2, s2, s1
+; RV64-NEXT: and s5, t3, a3
+; RV64-NEXT: or s1, s4, s3
+; RV64-NEXT: seqz s3, a1
+; RV64-NEXT: or t2, a7, t2
+; RV64-NEXT: or s4, a1, t0
; RV64-NEXT: addi a1, a1, -1
-; RV64-NEXT: andi t2, s1, 1
-; RV64-NEXT: or t0, t6, s3
-; RV64-NEXT: sltu t6, s2, s6
-; RV64-NEXT: snez s5, s5
-; RV64-NEXT: andi s3, s0, 1
-; RV64-NEXT: sub s1, s4, t6
-; RV64-NEXT: add a7, a7, s5
-; RV64-NEXT: addi a7, a7, 1
-; RV64-NEXT: andi a7, a7, 1
-; RV64-NEXT: or t6, a1, t5
-; RV64-NEXT: or s4, t6, a7
-; RV64-NEXT: sub s2, s2, s6
+; RV64-NEXT: or s0, s0, t5
+; RV64-NEXT: andi a7, t3, 1
+; RV64-NEXT: sltu t3, t4, s5
+; RV64-NEXT: sub t0, t0, s3
+; RV64-NEXT: snez s3, s4
+; RV64-NEXT: or t5, t6, s1
+; RV64-NEXT: andi s1, s0, 1
+; RV64-NEXT: sub t3, s2, t3
+; RV64-NEXT: add a6, a6, s3
+; RV64-NEXT: addi a6, a6, 1
+; RV64-NEXT: andi a6, a6, 1
+; RV64-NEXT: or t6, a1, t0
+; RV64-NEXT: or s2, t6, a6
+; RV64-NEXT: sub t4, t4, s5
; RV64-NEXT: li t6, 0
; RV64-NEXT: li s0, 0
-; RV64-NEXT: beqz s4, .LBB3_29
+; RV64-NEXT: beqz s2, .LBB3_28
; RV64-NEXT: .LBB3_26: # %udiv-do-while
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
-; RV64-NEXT: srli t3, s2, 63
-; RV64-NEXT: slli t4, s1, 1
-; RV64-NEXT: slli s2, s2, 1
-; RV64-NEXT: or s4, t4, t3
-; RV64-NEXT: andi t3, s3, 1
-; RV64-NEXT: or s2, s2, t3
-; RV64-NEXT: bne a2, s4, .LBB3_24
+; RV64-NEXT: srli s2, t4, 63
+; RV64-NEXT: slli s3, t3, 1
+; RV64-NEXT: slli t4, t4, 1
+; RV64-NEXT: or s2, s3, s2
+; RV64-NEXT: andi s1, s1, 1
+; RV64-NEXT: or t4, t4, s1
+; RV64-NEXT: bne a2, s2, .LBB3_24
; RV64-NEXT: # %bb.27: # in Loop: Header=BB3_26 Depth=1
-; RV64-NEXT: sltu t3, t1, s2
+; RV64-NEXT: sltu s1, t1, t4
; RV64-NEXT: j .LBB3_25
-; RV64-NEXT: .LBB3_28:
-; RV64-NEXT: li t3, 0
-; RV64-NEXT: li t4, 0
-; RV64-NEXT: .LBB3_29: # %udiv-loop-exit
-; RV64-NEXT: srli a2, a6, 63
-; RV64-NEXT: slli a3, t0, 1
-; RV64-NEXT: srli a4, t0, 63
-; RV64-NEXT: slli a6, a6, 1
-; RV64-NEXT: or a1, t2, a6
-; RV64-NEXT: or a2, t3, a2
-; RV64-NEXT: or a4, t4, a4
-; RV64-NEXT: or t0, a2, a3
-; RV64-NEXT: andi t2, a4, 1
-; RV64-NEXT: .LBB3_30: # %udiv-end
-; RV64-NEXT: andi a2, t2, 1
-; RV64-NEXT: sd a1, 0(a0)
-; RV64-NEXT: sd t0, 8(a0)
-; RV64-NEXT: sb a2, 16(a0)
-; RV64-NEXT: ld s0, 184(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 176(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 168(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 160(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 152(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s5, 144(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s6, 136(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 192
+; RV64-NEXT: .LBB3_28: # %udiv-loop-exit
+; RV64-NEXT: srli a2, t2, 63
+; RV64-NEXT: slli a3, t5, 1
+; RV64-NEXT: srli a1, t5, 63
+; RV64-NEXT: slli a4, t2, 1
+; RV64-NEXT: or t2, a3, a2
+; RV64-NEXT: or t5, a7, a4
+; RV64-NEXT: .LBB3_29: # %udiv-end
+; RV64-NEXT: sd t5, 0(a0)
+; RV64-NEXT: sd t2, 8(a0)
+; RV64-NEXT: sb a1, 16(a0)
+; RV64-NEXT: ld s0, 168(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 160(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 152(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 144(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 136(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 128(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 176
; RV64-NEXT: ret
%res = udiv i129 %x, %y
ret i129 %res
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index ff236c72922f4..ce7741d31d93f 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -105,8 +105,8 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB1_3:
-; RV32I-NEXT: li a1, 0
; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: srli a0, a1, 1
@@ -252,14 +252,13 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_3:
; RV32I-NEXT: li a0, 64
-; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: srli s1, s1, 27
; RV32I-NEXT: add s1, s3, s1
; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: .LBB3_5: # %cond.false
+; RV32I-NEXT: .LBB3_5: # %cond.end
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -283,8 +282,8 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32XTHEADBB-NOB-NEXT: li a1, 64
; RV32XTHEADBB-NOB-NEXT: j .LBB3_5
; RV32XTHEADBB-NOB-NEXT: .LBB3_3:
-; RV32XTHEADBB-NOB-NEXT: li a1, 0
; RV32XTHEADBB-NOB-NEXT: li a0, 64
+; RV32XTHEADBB-NOB-NEXT: li a1, 0
; RV32XTHEADBB-NOB-NEXT: ret
; RV32XTHEADBB-NOB-NEXT: .LBB3_4:
; RV32XTHEADBB-NOB-NEXT: addi a1, a0, -1
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index dad71ee5de066..b6cfa3c741209 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -98,8 +98,8 @@ define i64 @ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB1_3:
-; RV32I-NEXT: li a1, 0
; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB1_4:
; RV32I-NEXT: srli a0, a1, 1
@@ -217,14 +217,13 @@ define i64 @cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_3:
; RV32I-NEXT: li a0, 64
-; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: srli s1, s1, 27
; RV32I-NEXT: add s1, s3, s1
; RV32I-NEXT: lbu a0, 0(s1)
-; RV32I-NEXT: .LBB3_5: # %cond.false
+; RV32I-NEXT: .LBB3_5: # %cond.end
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
index 5b5d3d856d878..5ec403777cedd 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
@@ -175,16 +175,14 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
; RV32I-NEXT: addi a0, a0, 32
; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_3:
-; RV32I-NEXT: li a1, 0
; RV32I-NEXT: li a0, 64
-; RV32I-NEXT: j .LBB3_6
+; RV32I-NEXT: j .LBB3_5
; RV32I-NEXT: .LBB3_4:
; RV32I-NEXT: srli s0, s0, 27
; RV32I-NEXT: add s0, s4, s0
; RV32I-NEXT: lbu a0, 0(s0)
-; RV32I-NEXT: .LBB3_5: # %cond.false
+; RV32I-NEXT: .LBB3_5: # %cond.end
; RV32I-NEXT: li a1, 0
-; RV32I-NEXT: .LBB3_6: # %cond.end
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -626,8 +624,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB11_3:
-; RV32I-NEXT: li a1, 0
; RV32I-NEXT: li a0, 64
+; RV32I-NEXT: li a1, 0
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB11_4:
; RV32I-NEXT: srli a4, a3, 1
diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
index 143e10e6909e4..f780f0172647f 100644
--- a/llvm/test/CodeGen/X86/bsf.ll
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -182,7 +182,6 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl %edx, %edx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: je .LBB6_1
@@ -192,24 +191,22 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
; X86-NEXT: # %bb.4: # %cond.false
; X86-NEXT: rep bsfl %ecx, %eax
; X86-NEXT: addl $32, %eax
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: je .LBB6_6
-; X86-NEXT: jmp .LBB6_7
+; X86-NEXT: jmp .LBB6_5
; X86-NEXT: .LBB6_1:
; X86-NEXT: movl $64, %eax
+; X86-NEXT: jmp .LBB6_5
+; X86-NEXT: .LBB6_3:
+; X86-NEXT: rep bsfl %esi, %eax
+; X86-NEXT: .LBB6_5: # %cond.end
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: orl %ecx, %esi
; X86-NEXT: jne .LBB6_7
-; X86-NEXT: .LBB6_6: # %cond.end
+; X86-NEXT: # %bb.6: # %cond.end
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: .LBB6_7: # %cond.end
; X86-NEXT: popl %esi
; X86-NEXT: retl
-; X86-NEXT: .LBB6_3:
-; X86-NEXT: rep bsfl %esi, %eax
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: je .LBB6_6
-; X86-NEXT: jmp .LBB6_7
;
; X64-LABEL: cmov_bsf64:
; X64: # %bb.0:
@@ -269,71 +266,66 @@ define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 32(%ebp), %edi
-; X86-NEXT: movl 24(%ebp), %ecx
-; X86-NEXT: movl 36(%ebp), %ebx
-; X86-NEXT: movl 28(%ebp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl 36(%ebp), %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: je .LBB8_1
; X86-NEXT: # %bb.2: # %cond.false
-; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: testl %edx, %edx
; X86-NEXT: jne .LBB8_3
; X86-NEXT: # %bb.4: # %cond.false
-; X86-NEXT: rep bsfl %esi, %eax
-; X86-NEXT: addl $32, %eax
-; X86-NEXT: jmp .LBB8_5
+; X86-NEXT: rep bsfl %edi, %ebx
+; X86-NEXT: addl $32, %ebx
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: je .LBB8_7
+; X86-NEXT: .LBB8_6:
+; X86-NEXT: rep bsfl %ecx, %esi
+; X86-NEXT: jmp .LBB8_8
; X86-NEXT: .LBB8_1:
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: movl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $128, %ebx
; X86-NEXT: jmp .LBB8_11
; X86-NEXT: .LBB8_3:
-; X86-NEXT: rep bsfl %ecx, %eax
-; X86-NEXT: .LBB8_5: # %cond.false
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: rep bsfl %edx, %ebx
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: jne .LBB8_6
-; X86-NEXT: # %bb.7: # %cond.false
-; X86-NEXT: rep bsfl %ebx, %edx
-; X86-NEXT: addl $32, %edx
-; X86-NEXT: jmp .LBB8_8
-; X86-NEXT: .LBB8_6:
-; X86-NEXT: rep bsfl %edi, %edx
+; X86-NEXT: .LBB8_7: # %cond.false
+; X86-NEXT: rep bsfl 36(%ebp), %esi
+; X86-NEXT: addl $32, %esi
; X86-NEXT: .LBB8_8: # %cond.false
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: orl %edi, %ecx
; X86-NEXT: jne .LBB8_10
; X86-NEXT: # %bb.9: # %cond.false
-; X86-NEXT: addl $64, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $64, %esi
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: .LBB8_10: # %cond.false
-; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: movl 32(%ebp), %ecx
; X86-NEXT: .LBB8_11: # %cond.end
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: orl 32(%ebp), %ecx
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl 36(%ebp), %edi
+; X86-NEXT: orl %edx, %edi
; X86-NEXT: je .LBB8_12
; X86-NEXT: # %bb.13: # %cond.end
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: xorl %esi, %esi
; X86-NEXT: jmp .LBB8_14
; X86-NEXT: .LBB8_12:
-; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl 52(%ebp), %esi
; X86-NEXT: movl 48(%ebp), %edx
-; X86-NEXT: movl 44(%ebp), %edi
-; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl 40(%ebp), %ebx
; X86-NEXT: .LBB8_14: # %cond.end
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %ebx, (%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
index ab0478a4e944b..affacc5ee6487 100644
--- a/llvm/test/CodeGen/X86/bsr.ll
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -200,40 +200,37 @@ define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind {
define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind {
; X86-LABEL: cmov_bsr64:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: je .LBB6_1
; X86-NEXT: # %bb.2: # %cond.false
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: jne .LBB6_3
; X86-NEXT: # %bb.4: # %cond.false
-; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: bsrl %edx, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: orl $32, %eax
-; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %ecx, %edx
; X86-NEXT: je .LBB6_7
; X86-NEXT: jmp .LBB6_6
; X86-NEXT: .LBB6_1:
; X86-NEXT: movl $64, %eax
-; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %ecx, %edx
; X86-NEXT: jne .LBB6_6
; X86-NEXT: .LBB6_7: # %cond.end
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: popl %esi
; X86-NEXT: retl
; X86-NEXT: .LBB6_3:
; X86-NEXT: bsrl %ecx, %eax
; X86-NEXT: xorl $31, %eax
-; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %ecx, %edx
; X86-NEXT: je .LBB6_7
; X86-NEXT: .LBB6_6:
; X86-NEXT: xorl $63, %eax
-; X86-NEXT: popl %esi
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: cmov_bsr64:
@@ -311,57 +308,59 @@ define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind {
; X86-NEXT: testl %esi, %esi
; X86-NEXT: jne .LBB8_3
; X86-NEXT: # %bb.4: # %cond.false
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: bsrl %ebx, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: orl $32, %esi
; X86-NEXT: testl %edi, %edi
; X86-NEXT: je .LBB8_7
; X86-NEXT: .LBB8_6:
-; X86-NEXT: bsrl %edi, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: jmp .LBB8_8
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: je .LBB8_9
+; X86-NEXT: jmp .LBB8_10
; X86-NEXT: .LBB8_1:
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl $128, %esi
; X86-NEXT: jmp .LBB8_11
; X86-NEXT: .LBB8_3:
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: bsrl %esi, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: testl %edi, %edi
; X86-NEXT: jne .LBB8_6
; X86-NEXT: .LBB8_7: # %cond.false
-; X86-NEXT: bsrl %ecx, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: orl $32, %eax
-; X86-NEXT: .LBB8_8: # %cond.false
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: orl 36(%ebp), %edx
+; X86-NEXT: bsrl %ecx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: orl %eax, %ebx
; X86-NEXT: jne .LBB8_10
-; X86-NEXT: # %bb.9: # %cond.false
-; X86-NEXT: orl $64, %eax
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: .LBB8_9: # %cond.false
+; X86-NEXT: orl $64, %edx
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: .LBB8_10: # %cond.false
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: movl 32(%ebp), %ebx
; X86-NEXT: .LBB8_11: # %cond.end
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: orl 32(%ebp), %ecx
-; X86-NEXT: orl 36(%ebp), %edi
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl %eax, %edi
; X86-NEXT: orl %ecx, %edi
; X86-NEXT: je .LBB8_12
; X86-NEXT: # %bb.13: # %cond.end
; X86-NEXT: xorl $127, %esi
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: xorl %edi, %edi
; X86-NEXT: jmp .LBB8_14
; X86-NEXT: .LBB8_12:
-; X86-NEXT: movl 52(%ebp), %edx
-; X86-NEXT: movl 48(%ebp), %ebx
+; X86-NEXT: movl 52(%ebp), %edi
+; X86-NEXT: movl 48(%ebp), %edx
; X86-NEXT: movl 44(%ebp), %ecx
; X86-NEXT: movl 40(%ebp), %esi
; X86-NEXT: .LBB8_14: # %cond.end
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 12(%eax)
-; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
; X86-NEXT: movl %ecx, 4(%eax)
; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: leal -12(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 752f6659948e6..c5aa2a9f40239 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -288,14 +288,12 @@ define i32 @ctlo_i32_undef(i32 %x) {
define i64 @ctlo_i64(i64 %x) nounwind {
; X86-NOCMOV-LABEL: ctlo_i64:
; X86-NOCMOV: # %bb.0:
-; X86-NOCMOV-NEXT: pushl %esi
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NOCMOV-NEXT: notl %ecx
; X86-NOCMOV-NEXT: notl %eax
-; X86-NOCMOV-NEXT: xorl %edx, %edx
-; X86-NOCMOV-NEXT: movl %eax, %esi
-; X86-NOCMOV-NEXT: orl %ecx, %esi
+; X86-NOCMOV-NEXT: movl %eax, %edx
+; X86-NOCMOV-NEXT: orl %ecx, %edx
; X86-NOCMOV-NEXT: je .LBB6_1
; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
; X86-NOCMOV-NEXT: testl %ecx, %ecx
@@ -304,16 +302,16 @@ define i64 @ctlo_i64(i64 %x) nounwind {
; X86-NOCMOV-NEXT: bsrl %eax, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
; X86-NOCMOV-NEXT: orl $32, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB6_1:
; X86-NOCMOV-NEXT: movl $64, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB6_3:
; X86-NOCMOV-NEXT: bsrl %ecx, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: ctlo_i64:
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 1267fe9033454..789c681174efb 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -402,12 +402,10 @@ define i32 @ctlz_i32_zero_test(i32 %n) {
define i64 @ctlz_i64_zero_test(i64 %n) nounwind {
; X86-NOCMOV-LABEL: ctlz_i64_zero_test:
; X86-NOCMOV: # %bb.0:
-; X86-NOCMOV-NEXT: pushl %esi
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOCMOV-NEXT: xorl %edx, %edx
-; X86-NOCMOV-NEXT: movl %ecx, %esi
-; X86-NOCMOV-NEXT: orl %eax, %esi
+; X86-NOCMOV-NEXT: movl %ecx, %edx
+; X86-NOCMOV-NEXT: orl %eax, %edx
; X86-NOCMOV-NEXT: je .LBB7_1
; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
; X86-NOCMOV-NEXT: testl %eax, %eax
@@ -416,16 +414,16 @@ define i64 @ctlz_i64_zero_test(i64 %n) nounwind {
; X86-NOCMOV-NEXT: bsrl %ecx, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
; X86-NOCMOV-NEXT: orl $32, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB7_1:
; X86-NOCMOV-NEXT: movl $64, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB7_3:
; X86-NOCMOV-NEXT: bsrl %eax, %eax
; X86-NOCMOV-NEXT: xorl $31, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: ctlz_i64_zero_test:
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index c95b7bd7f131a..a88fb96dd7c8c 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -355,12 +355,10 @@ define i32 @cttz_i32_zero_test(i32 %n) {
define i64 @cttz_i64_zero_test(i64 %n) nounwind {
; X86-NOCMOV-LABEL: cttz_i64_zero_test:
; X86-NOCMOV: # %bb.0:
-; X86-NOCMOV-NEXT: pushl %esi
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOCMOV-NEXT: xorl %edx, %edx
-; X86-NOCMOV-NEXT: movl %ecx, %esi
-; X86-NOCMOV-NEXT: orl %eax, %esi
+; X86-NOCMOV-NEXT: movl %ecx, %edx
+; X86-NOCMOV-NEXT: orl %eax, %edx
; X86-NOCMOV-NEXT: je .LBB7_1
; X86-NOCMOV-NEXT: # %bb.2: # %cond.false
; X86-NOCMOV-NEXT: testl %ecx, %ecx
@@ -368,26 +366,24 @@ define i64 @cttz_i64_zero_test(i64 %n) nounwind {
; X86-NOCMOV-NEXT: # %bb.4: # %cond.false
; X86-NOCMOV-NEXT: rep bsfl %eax, %eax
; X86-NOCMOV-NEXT: addl $32, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB7_1:
; X86-NOCMOV-NEXT: movl $64, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
; X86-NOCMOV-NEXT: .LBB7_3:
; X86-NOCMOV-NEXT: rep bsfl %ecx, %eax
-; X86-NOCMOV-NEXT: popl %esi
+; X86-NOCMOV-NEXT: xorl %edx, %edx
; X86-NOCMOV-NEXT: retl
;
; X86-CMOV-LABEL: cttz_i64_zero_test:
; X86-CMOV: # %bb.0:
; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-CMOV-NOT: rep
; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx
; X86-CMOV-NEXT: movl $32, %edx
; X86-CMOV-NEXT: cmovnel %ecx, %edx
; X86-CMOV-NEXT: addl $32, %edx
-; X86-CMOV-NOT: rep
; X86-CMOV-NEXT: bsfl %eax, %eax
; X86-CMOV-NEXT: cmovel %edx, %eax
; X86-CMOV-NEXT: xorl %edx, %edx
@@ -594,13 +590,11 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) {
define i32 @cttz_i32_osize(i32 %x) optsize {
; X86-LABEL: cttz_i32_osize:
; X86: # %bb.0:
-; X86-NOT: rep
; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: cttz_i32_osize:
; X64: # %bb.0:
-; X64-NOT: rep
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
;
@@ -630,13 +624,11 @@ define i32 @cttz_i32_osize(i32 %x) optsize {
define i32 @cttz_i32_msize(i32 %x) minsize {
; X86-LABEL: cttz_i32_msize:
; X86: # %bb.0:
-; X86-NOT: rep
; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: cttz_i32_msize:
; X64: # %bb.0:
-; X64-NOT: rep
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 455b72d16a075..1a696d546a1a3 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -152,14 +152,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $176, %esp
-; X86-NEXT: movl 32(%ebp), %edx
-; X86-NEXT: movl 36(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: movl %ecx, %edi
; X86-NEXT: xorl %eax, %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, %esi
; X86-NEXT: movl 28(%ebp), %edx
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl 24(%ebp), %ecx
@@ -172,26 +172,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%ebp), %esi
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl 52(%ebp), %edi
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %edi
; X86-NEXT: movl 48(%ebp), %ecx
; X86-NEXT: xorl %edx, %ecx
; X86-NEXT: movl 44(%ebp), %ebx
; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl 40(%ebp), %edi
-; X86-NEXT: xorl %edx, %edi
-; X86-NEXT: subl %edx, %edi
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: subl %edx, %esi
; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
@@ -203,92 +203,89 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: bsrl %edi, %edi
-; X86-NEXT: xorl $31, %edi
-; X86-NEXT: orl $32, %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnel %edx, %edi
-; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: cmovnel %edx, %ebx
+; X86-NEXT: orl $64, %ebx
; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: cmovnel %ecx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: cmovnel %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: bsrl %eax, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: bsrl %edi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %esi
; X86-NEXT: xorl $31, %esi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %eax, %eax
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: orl $64, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: subl %edx, %edi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: movl $0, %eax
-; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $127, %ecx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edi, %ecx
+; X86-NEXT: cmpl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %esi, %ecx
-; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %eax
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: jne .LBB4_1
-; X86-NEXT: # %bb.8: # %_udiv-special-cases
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: cmovnel %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: cmovnel %edi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: xorl $127, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: cmovnel %edi, %eax
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB4_8
+; X86-NEXT: # %bb.1: # %_udiv-special-cases
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl $127, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: je .LBB4_9
-; X86-NEXT: # %bb.5: # %udiv-bb1
+; X86-NEXT: # %bb.2: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: xorps %xmm0, %xmm0
@@ -297,264 +294,252 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: xorb $127, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 152(%esp,%eax), %esi
-; X86-NEXT: movl 156(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%esp,%eax), %edx
+; X86-NEXT: movl 152(%esp,%eax), %ebx
+; X86-NEXT: movl 156(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esp,%eax), %esi
; X86-NEXT: movl 148(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: jae .LBB4_2
-; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: jae .LBB4_5
+; X86-NEXT: # %bb.3:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: jmp .LBB4_7
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: jmp .LBB4_9
-; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: .LBB4_5: # %udiv-preheader
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 108(%esp,%eax), %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 108(%esp,%eax), %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 104(%esp,%eax), %ebx
; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: shrdl %cl, %edi, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%eax), %esi
-; X86-NEXT: movl 100(%esp,%eax), %eax
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: shrdl %cl, %ebx, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl 96(%esp,%eax), %edx
+; X86-NEXT: movl 100(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shrl %cl, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: .p2align 4
-; X86-NEXT: .LBB4_3: # %udiv-do-while
+; X86-NEXT: .LBB4_6: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shldl $1, %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: shldl $1, %ecx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl $1, %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %edi, %edi
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $1, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $-1, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: jne .LBB4_3
-; X86-NEXT: # %bb.4:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB4_6
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebx, %esi
-; X86-NEXT: orl %edx, %esi
+; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl $1, %eax, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: shldl $1, %edi, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: .LBB4_9: # %udiv-end
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: leal (%edi,%esi,2), %edi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB4_8: # %udiv-end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: .LBB4_9: # %udiv-end
; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: xorl %ecx, %ebx
; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: subl %ecx, %edi
; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %ecx, %ebx
; X86-NEXT: sbbl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 56(%ebp), %ecx
-; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: movl %edi, (%ecx)
; X86-NEXT: movl %eax, 4(%ecx)
; X86-NEXT: movl %ebx, 8(%ecx)
; X86-NEXT: movl %esi, 12(%ecx)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebp), %ecx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull 40(%ebp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %edi
; X86-NEXT: movl 44(%ebp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull 44(%ebp)
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: imull %eax, %ebx
-; X86-NEXT: mull %edi
+; X86-NEXT: imull %eax, %edi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: imull %esi, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: imull 44(%ebp), %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: addl %edi, %ebx
; X86-NEXT: movl 48(%ebp), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull %esi, %ecx
+; X86-NEXT: movl 52(%ebp), %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: imull %edx, %ebx
+; X86-NEXT: imull %edx, %esi
; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: addl %ecx, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl 24(%ebp), %edx
; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl 28(%ebp), %ecx
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl 32(%ebp), %edi
; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 36(%ebp), %esi
-; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: sbbl %esi, %ebx
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl %edx, (%eax)
; X86-NEXT: movl %ecx, 4(%eax)
; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 859e9244d29d2..7f5ede7a858d2 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -152,49 +152,48 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $160, %esp
-; X86-NEXT: movl 40(%ebp), %ebx
-; X86-NEXT: movl 52(%ebp), %esi
-; X86-NEXT: movl 44(%ebp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: orl 48(%ebp), %ecx
+; X86-NEXT: movl 48(%ebp), %esi
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl 52(%ebp), %edi
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: sete %cl
+; X86-NEXT: sete %bl
; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: orl 36(%ebp), %eax
; X86-NEXT: movl 24(%ebp), %edx
; X86-NEXT: orl 32(%ebp), %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
-; X86-NEXT: orb %cl, %al
+; X86-NEXT: orb %bl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl 48(%ebp), %ecx
+; X86-NEXT: bsrl %esi, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: bsrl %eax, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: bsrl %ebx, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: orl $32, %eax
-; X86-NEXT: testl %edi, %edi
-; X86-NEXT: cmovnel %edx, %eax
-; X86-NEXT: orl $64, %eax
-; X86-NEXT: movl 48(%ebp), %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: cmovnel %ecx, %eax
-; X86-NEXT: movl 36(%ebp), %ebx
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: bsrl 40(%ebp), %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl $32, %ebx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: cmovnel %edx, %ebx
+; X86-NEXT: orl $64, %ebx
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: cmovnel %ecx, %ebx
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl 32(%ebp), %ecx
-; X86-NEXT: bsrl %ecx, %ecx
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: bsrl %eax, %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %ecx
; X86-NEXT: movl 28(%ebp), %edi
; X86-NEXT: bsrl %edi, %esi
@@ -205,23 +204,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: orl $64, %edx
-; X86-NEXT: movl 32(%ebp), %esi
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: orl %edi, %esi
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %ecx, %ecx
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
; X86-NEXT: movl $127, %edx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %eax, %edx
-; X86-NEXT: movl $0, %edx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: cmpl %ebx, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edx
; X86-NEXT: movl $0, %edx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %ecx, %edx
@@ -230,77 +229,82 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sbbl %esi, %edx
; X86-NEXT: setb %dl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: cmovnel %edi, %eax
-; X86-NEXT: movl 32(%ebp), %esi
-; X86-NEXT: cmovnel %edi, %esi
-; X86-NEXT: movl 28(%ebp), %edx
-; X86-NEXT: cmovnel %edi, %edx
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: cmovnel %edi, %ebx
-; X86-NEXT: movl 56(%ebp), %edi
-; X86-NEXT: jne .LBB4_8
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: cmovnel %eax, %edx
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: cmovnel %eax, %ebx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: cmovnel %eax, %edi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebp), %esi
+; X86-NEXT: jne .LBB4_6
; X86-NEXT: # %bb.1: # %_udiv-special-cases
-; X86-NEXT: movl %eax, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: xorl $127, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl 56(%ebp), %edi
-; X86-NEXT: movl 24(%ebp), %ecx
-; X86-NEXT: je .LBB4_8
+; X86-NEXT: je .LBB4_6
; X86-NEXT: # %bb.2: # %udiv-bb1
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 28(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 32(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl 36(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: xorb $127, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 136(%esp,%eax), %esi
-; X86-NEXT: movl 140(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %esi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 128(%esp,%eax), %ebx
-; X86-NEXT: movl 132(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $0, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: jae .LBB4_3
-; X86-NEXT: # %bb.6:
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: # %bb.7:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: jmp .LBB4_7
+; X86-NEXT: jmp .LBB4_5
; X86-NEXT: .LBB4_3: # %udiv-preheader
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 28(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 32(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 36(%ebp), %edi
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -309,23 +313,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl 92(%esp,%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esp,%eax), %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esp,%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %esi
+; X86-NEXT: movl 84(%esp,%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shrl %cl, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%esp,%eax), %edx
-; X86-NEXT: movl 84(%esp,%eax), %eax
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shrdl %cl, %edi, %ebx
-; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 40(%ebp), %eax
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -339,141 +343,141 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB4_4: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ebx
-; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: shldl $1, %esi, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: sarl $31, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl 52(%ebp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 52(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl 48(%ebp), %eax
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl 44(%ebp), %edx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl 44(%ebp), %ebx
; X86-NEXT: andl 40(%ebp), %ecx
-; X86-NEXT: subl %ecx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jne .LBB4_4
-; X86-NEXT: # %bb.5:
+; X86-NEXT: .LBB4_5: # %udiv-loop-exit
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl 56(%ebp), %edi
-; X86-NEXT: .LBB4_7: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: shldl $1, %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: addl %ebx, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: .LBB4_8: # %udiv-end
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, (%edi)
-; X86-NEXT: movl %edx, 4(%edi)
-; X86-NEXT: movl %esi, 8(%edi)
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: leal (%eax,%esi,2), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl 56(%ebp), %esi
+; X86-NEXT: .LBB4_6: # %udiv-end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 4(%esi)
+; X86-NEXT: movl %ebx, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl 48(%ebp), %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %edx, %esi
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: imull %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl 52(%ebp), %edi
-; X86-NEXT: imull %ebx, %edi
-; X86-NEXT: addl %edx, %edi
-; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 52(%ebp), %eax
+; X86-NEXT: imull %edi, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull 40(%ebp), %ecx
+; X86-NEXT: imull %edi, %ecx
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl 44(%ebp), %eax
-; X86-NEXT: imull %eax, %ebx
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: imull %esi, %ebx
; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 40(%ebp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %edi, %ecx
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull 44(%ebp)
-; X86-NEXT: movl 28(%ebp), %esi
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull 44(%ebp)
@@ -481,19 +485,20 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 32(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 36(%ebp), %ecx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 426587a84ce17..28029793211f0 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -723,237 +723,217 @@ define i64 @test4(i64 %a, i64 %b) nounwind {
define i256 @PR25498(i256 %a) nounwind {
; ILP-LABEL: PR25498:
; ILP: # %bb.0:
-; ILP-NEXT: pushq %rbx
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: xorl %edi, %edi
-; ILP-NEXT: movq %rsi, %rbx
-; ILP-NEXT: negq %rbx
-; ILP-NEXT: movl $0, %r11d
-; ILP-NEXT: sbbq %rdx, %r11
+; ILP-NEXT: movq %rsi, %r11
+; ILP-NEXT: negq %r11
+; ILP-NEXT: movl $0, %r10d
+; ILP-NEXT: sbbq %rdx, %r10
; ILP-NEXT: movl $0, %r9d
; ILP-NEXT: sbbq %rcx, %r9
-; ILP-NEXT: movl $0, %r10d
-; ILP-NEXT: sbbq %r8, %r10
+; ILP-NEXT: sbbq %r8, %rdi
; ILP-NEXT: orq %r8, %rdx
; ILP-NEXT: orq %rcx, %rsi
; ILP-NEXT: orq %rdx, %rsi
; ILP-NEXT: je .LBB4_1
; ILP-NEXT: # %bb.2: # %cond.false
-; ILP-NEXT: bsrq %r11, %rdx
-; ILP-NEXT: bsrq %r10, %rcx
+; ILP-NEXT: bsrq %r10, %rdx
+; ILP-NEXT: bsrq %rdi, %rcx
; ILP-NEXT: xorq $63, %rcx
; ILP-NEXT: bsrq %r9, %rsi
; ILP-NEXT: xorq $63, %rsi
; ILP-NEXT: orq $64, %rsi
-; ILP-NEXT: testq %r10, %r10
+; ILP-NEXT: testq %rdi, %rdi
; ILP-NEXT: cmovneq %rcx, %rsi
; ILP-NEXT: xorq $63, %rdx
-; ILP-NEXT: bsrq %rbx, %rcx
+; ILP-NEXT: bsrq %r11, %rcx
; ILP-NEXT: xorq $63, %rcx
; ILP-NEXT: orq $64, %rcx
-; ILP-NEXT: testq %r11, %r11
+; ILP-NEXT: testq %r10, %r10
; ILP-NEXT: cmovneq %rdx, %rcx
; ILP-NEXT: orq $128, %rcx
-; ILP-NEXT: xorl %edi, %edi
-; ILP-NEXT: orq %r10, %r9
+; ILP-NEXT: orq %rdi, %r9
; ILP-NEXT: cmovneq %rsi, %rcx
; ILP-NEXT: jmp .LBB4_3
; ILP-NEXT: .LBB4_1:
; ILP-NEXT: movl $256, %ecx # imm = 0x100
; ILP-NEXT: .LBB4_3: # %cond.end
+; ILP-NEXT: xorps %xmm0, %xmm0
+; ILP-NEXT: movaps %xmm0, 16(%rax)
; ILP-NEXT: movq %rcx, (%rax)
-; ILP-NEXT: movq %rdi, 8(%rax)
-; ILP-NEXT: movq %rdi, 16(%rax)
-; ILP-NEXT: movq %rdi, 24(%rax)
-; ILP-NEXT: popq %rbx
+; ILP-NEXT: movq $0, 8(%rax)
; ILP-NEXT: retq
;
; HYBRID-LABEL: PR25498:
; HYBRID: # %bb.0:
-; HYBRID-NEXT: pushq %rbx
; HYBRID-NEXT: movq %rdi, %rax
; HYBRID-NEXT: xorl %edi, %edi
-; HYBRID-NEXT: movq %rsi, %rbx
-; HYBRID-NEXT: negq %rbx
-; HYBRID-NEXT: movl $0, %r11d
-; HYBRID-NEXT: sbbq %rdx, %r11
+; HYBRID-NEXT: movq %rsi, %r11
+; HYBRID-NEXT: negq %r11
+; HYBRID-NEXT: movl $0, %r10d
+; HYBRID-NEXT: sbbq %rdx, %r10
; HYBRID-NEXT: movl $0, %r9d
; HYBRID-NEXT: sbbq %rcx, %r9
-; HYBRID-NEXT: movl $0, %r10d
-; HYBRID-NEXT: sbbq %r8, %r10
+; HYBRID-NEXT: sbbq %r8, %rdi
; HYBRID-NEXT: orq %r8, %rdx
; HYBRID-NEXT: orq %rcx, %rsi
; HYBRID-NEXT: orq %rdx, %rsi
; HYBRID-NEXT: je .LBB4_1
; HYBRID-NEXT: # %bb.2: # %cond.false
-; HYBRID-NEXT: bsrq %r10, %rcx
+; HYBRID-NEXT: bsrq %rdi, %rcx
; HYBRID-NEXT: xorq $63, %rcx
; HYBRID-NEXT: bsrq %r9, %rdx
; HYBRID-NEXT: xorq $63, %rdx
; HYBRID-NEXT: orq $64, %rdx
-; HYBRID-NEXT: testq %r10, %r10
+; HYBRID-NEXT: testq %rdi, %rdi
; HYBRID-NEXT: cmovneq %rcx, %rdx
-; HYBRID-NEXT: bsrq %r11, %rsi
+; HYBRID-NEXT: bsrq %r10, %rsi
; HYBRID-NEXT: xorq $63, %rsi
-; HYBRID-NEXT: bsrq %rbx, %rcx
+; HYBRID-NEXT: bsrq %r11, %rcx
; HYBRID-NEXT: xorq $63, %rcx
; HYBRID-NEXT: orq $64, %rcx
-; HYBRID-NEXT: testq %r11, %r11
+; HYBRID-NEXT: testq %r10, %r10
; HYBRID-NEXT: cmovneq %rsi, %rcx
; HYBRID-NEXT: orq $128, %rcx
-; HYBRID-NEXT: orq %r10, %r9
+; HYBRID-NEXT: orq %rdi, %r9
; HYBRID-NEXT: cmovneq %rdx, %rcx
-; HYBRID-NEXT: xorl %edi, %edi
; HYBRID-NEXT: jmp .LBB4_3
; HYBRID-NEXT: .LBB4_1:
; HYBRID-NEXT: movl $256, %ecx # imm = 0x100
; HYBRID-NEXT: .LBB4_3: # %cond.end
+; HYBRID-NEXT: xorps %xmm0, %xmm0
+; HYBRID-NEXT: movaps %xmm0, 16(%rax)
; HYBRID-NEXT: movq %rcx, (%rax)
-; HYBRID-NEXT: movq %rdi, 8(%rax)
-; HYBRID-NEXT: movq %rdi, 16(%rax)
-; HYBRID-NEXT: movq %rdi, 24(%rax)
-; HYBRID-NEXT: popq %rbx
+; HYBRID-NEXT: movq $0, 8(%rax)
; HYBRID-NEXT: retq
;
; BURR-LABEL: PR25498:
; BURR: # %bb.0:
-; BURR-NEXT: pushq %rbx
; BURR-NEXT: movq %rdi, %rax
; BURR-NEXT: xorl %edi, %edi
-; BURR-NEXT: movq %rsi, %rbx
-; BURR-NEXT: negq %rbx
-; BURR-NEXT: movl $0, %r11d
-; BURR-NEXT: sbbq %rdx, %r11
+; BURR-NEXT: movq %rsi, %r11
+; BURR-NEXT: negq %r11
+; BURR-NEXT: movl $0, %r10d
+; BURR-NEXT: sbbq %rdx, %r10
; BURR-NEXT: movl $0, %r9d
; BURR-NEXT: sbbq %rcx, %r9
-; BURR-NEXT: movl $0, %r10d
-; BURR-NEXT: sbbq %r8, %r10
+; BURR-NEXT: sbbq %r8, %rdi
; BURR-NEXT: orq %r8, %rdx
; BURR-NEXT: orq %rcx, %rsi
; BURR-NEXT: orq %rdx, %rsi
; BURR-NEXT: je .LBB4_1
; BURR-NEXT: # %bb.2: # %cond.false
-; BURR-NEXT: bsrq %r10, %rcx
+; BURR-NEXT: bsrq %rdi, %rcx
; BURR-NEXT: xorq $63, %rcx
; BURR-NEXT: bsrq %r9, %rdx
; BURR-NEXT: xorq $63, %rdx
; BURR-NEXT: orq $64, %rdx
-; BURR-NEXT: testq %r10, %r10
+; BURR-NEXT: testq %rdi, %rdi
; BURR-NEXT: cmovneq %rcx, %rdx
-; BURR-NEXT: bsrq %r11, %rsi
+; BURR-NEXT: bsrq %r10, %rsi
; BURR-NEXT: xorq $63, %rsi
-; BURR-NEXT: bsrq %rbx, %rcx
+; BURR-NEXT: bsrq %r11, %rcx
; BURR-NEXT: xorq $63, %rcx
; BURR-NEXT: orq $64, %rcx
-; BURR-NEXT: testq %r11, %r11
+; BURR-NEXT: testq %r10, %r10
; BURR-NEXT: cmovneq %rsi, %rcx
; BURR-NEXT: orq $128, %rcx
-; BURR-NEXT: orq %r10, %r9
+; BURR-NEXT: orq %rdi, %r9
; BURR-NEXT: cmovneq %rdx, %rcx
-; BURR-NEXT: xorl %edi, %edi
; BURR-NEXT: jmp .LBB4_3
; BURR-NEXT: .LBB4_1:
; BURR-NEXT: movl $256, %ecx # imm = 0x100
; BURR-NEXT: .LBB4_3: # %cond.end
; BURR-NEXT: movq %rcx, (%rax)
-; BURR-NEXT: movq %rdi, 8(%rax)
-; BURR-NEXT: movq %rdi, 16(%rax)
-; BURR-NEXT: movq %rdi, 24(%rax)
-; BURR-NEXT: popq %rbx
+; BURR-NEXT: xorps %xmm0, %xmm0
+; BURR-NEXT: movaps %xmm0, 16(%rax)
+; BURR-NEXT: movq $0, 8(%rax)
; BURR-NEXT: retq
;
; SRC-LABEL: PR25498:
; SRC: # %bb.0:
-; SRC-NEXT: pushq %rbx
; SRC-NEXT: movq %rdi, %rax
; SRC-NEXT: xorl %edi, %edi
-; SRC-NEXT: movq %rsi, %rbx
-; SRC-NEXT: negq %rbx
-; SRC-NEXT: movl $0, %r11d
-; SRC-NEXT: sbbq %rdx, %r11
+; SRC-NEXT: movq %rsi, %r11
+; SRC-NEXT: negq %r11
+; SRC-NEXT: movl $0, %r10d
+; SRC-NEXT: sbbq %rdx, %r10
; SRC-NEXT: movl $0, %r9d
; SRC-NEXT: sbbq %rcx, %r9
-; SRC-NEXT: movl $0, %r10d
-; SRC-NEXT: sbbq %r8, %r10
+; SRC-NEXT: sbbq %r8, %rdi
; SRC-NEXT: orq %r8, %rdx
; SRC-NEXT: orq %rcx, %rsi
; SRC-NEXT: orq %rdx, %rsi
; SRC-NEXT: je .LBB4_1
; SRC-NEXT: # %bb.2: # %cond.false
-; SRC-NEXT: bsrq %r10, %rcx
+; SRC-NEXT: bsrq %rdi, %rcx
; SRC-NEXT: xorq $63, %rcx
; SRC-NEXT: bsrq %r9, %rdx
; SRC-NEXT: xorq $63, %rdx
; SRC-NEXT: orq $64, %rdx
-; SRC-NEXT: testq %r10, %r10
+; SRC-NEXT: testq %rdi, %rdi
; SRC-NEXT: cmovneq %rcx, %rdx
-; SRC-NEXT: bsrq %r11, %rsi
+; SRC-NEXT: bsrq %r10, %rsi
; SRC-NEXT: xorq $63, %rsi
-; SRC-NEXT: bsrq %rbx, %rcx
+; SRC-NEXT: bsrq %r11, %rcx
; SRC-NEXT: xorq $63, %rcx
; SRC-NEXT: orq $64, %rcx
-; SRC-NEXT: testq %r11, %r11
+; SRC-NEXT: testq %r10, %r10
; SRC-NEXT: cmovneq %rsi, %rcx
; SRC-NEXT: orq $128, %rcx
-; SRC-NEXT: orq %r10, %r9
+; SRC-NEXT: orq %rdi, %r9
; SRC-NEXT: cmovneq %rdx, %rcx
-; SRC-NEXT: xorl %edi, %edi
; SRC-NEXT: jmp .LBB4_3
; SRC-NEXT: .LBB4_1:
; SRC-NEXT: movl $256, %ecx # imm = 0x100
; SRC-NEXT: .LBB4_3: # %cond.end
; SRC-NEXT: movq %rcx, (%rax)
-; SRC-NEXT: movq %rdi, 8(%rax)
-; SRC-NEXT: movq %rdi, 16(%rax)
-; SRC-NEXT: movq %rdi, 24(%rax)
-; SRC-NEXT: popq %rbx
+; SRC-NEXT: xorps %xmm0, %xmm0
+; SRC-NEXT: movaps %xmm0, 16(%rax)
+; SRC-NEXT: movq $0, 8(%rax)
; SRC-NEXT: retq
;
; LIN-LABEL: PR25498:
; LIN: # %bb.0:
-; LIN-NEXT: pushq %rbx
; LIN-NEXT: movq %rdi, %rax
-; LIN-NEXT: movq %rsi, %rbx
-; LIN-NEXT: negq %rbx
+; LIN-NEXT: movq %rsi, %r11
+; LIN-NEXT: negq %r11
; LIN-NEXT: xorl %edi, %edi
-; LIN-NEXT: movl $0, %r11d
-; LIN-NEXT: sbbq %rdx, %r11
+; LIN-NEXT: movl $0, %r10d
+; LIN-NEXT: sbbq %rdx, %r10
; LIN-NEXT: movl $0, %r9d
; LIN-NEXT: sbbq %rcx, %r9
-; LIN-NEXT: movl $0, %r10d
-; LIN-NEXT: sbbq %r8, %r10
+; LIN-NEXT: sbbq %r8, %rdi
; LIN-NEXT: orq %rcx, %rsi
; LIN-NEXT: orq %r8, %rdx
; LIN-NEXT: orq %rsi, %rdx
; LIN-NEXT: je .LBB4_1
; LIN-NEXT: # %bb.2: # %cond.false
-; LIN-NEXT: bsrq %rbx, %rcx
+; LIN-NEXT: bsrq %r11, %rcx
; LIN-NEXT: xorq $63, %rcx
; LIN-NEXT: orq $64, %rcx
-; LIN-NEXT: bsrq %r11, %rdx
+; LIN-NEXT: bsrq %r10, %rdx
; LIN-NEXT: xorq $63, %rdx
-; LIN-NEXT: testq %r11, %r11
+; LIN-NEXT: testq %r10, %r10
; LIN-NEXT: cmoveq %rcx, %rdx
; LIN-NEXT: orq $128, %rdx
; LIN-NEXT: bsrq %r9, %rsi
; LIN-NEXT: xorq $63, %rsi
; LIN-NEXT: orq $64, %rsi
-; LIN-NEXT: bsrq %r10, %rcx
+; LIN-NEXT: bsrq %rdi, %rcx
; LIN-NEXT: xorq $63, %rcx
-; LIN-NEXT: testq %r10, %r10
+; LIN-NEXT: testq %rdi, %rdi
; LIN-NEXT: cmoveq %rsi, %rcx
-; LIN-NEXT: orq %r10, %r9
+; LIN-NEXT: orq %rdi, %r9
; LIN-NEXT: cmoveq %rdx, %rcx
-; LIN-NEXT: xorl %edi, %edi
; LIN-NEXT: jmp .LBB4_3
; LIN-NEXT: .LBB4_1:
; LIN-NEXT: movl $256, %ecx # imm = 0x100
; LIN-NEXT: .LBB4_3: # %cond.end
+; LIN-NEXT: xorps %xmm0, %xmm0
+; LIN-NEXT: movaps %xmm0, 16(%rax)
; LIN-NEXT: movq %rcx, (%rax)
-; LIN-NEXT: movq %rdi, 8(%rax)
-; LIN-NEXT: movq %rdi, 16(%rax)
-; LIN-NEXT: movq %rdi, 24(%rax)
-; LIN-NEXT: popq %rbx
+; LIN-NEXT: movq $0, 8(%rax)
; LIN-NEXT: retq
%b = sub i256 0, %a
%cmpz = icmp eq i256 %b, 0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 6e0fa72398dda..d33507e1a5bb9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -268,13 +268,9 @@ define <8 x i32> @PR46393(<8 x i16> %a0, i8 %a1) {
define i64 @PR55050() {
; X86-LABEL: PR55050:
; X86: # %bb.0: # %entry
-; X86-NEXT: xorl %eax, %eax
; X86-NEXT: testb %al, %al
-; X86-NEXT: jne .LBB15_2
-; X86-NEXT: # %bb.1: # %if
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB15_2: # %exit
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: PR55050: