[llvm] [DAG] Always allow folding XOR patterns to ABS pre-legalization (PR #94601)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 7 02:22:33 PDT 2024


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/94601

>From fe0b9945547466fa1603eb3d863d730f34392144 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 6 Jun 2024 11:19:03 +0100
Subject: [PATCH] [DAG] Always allow folding XOR patterns to ABS
 pre-legalization

Removes residual ARM handling for vXi64 ABS nodes to prevent infinite loops.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |    4 +-
 llvm/lib/Target/ARM/ARMISelLowering.cpp       |   16 -
 llvm/lib/Target/ARM/ARMInstrNEON.td           |   16 -
 ...amdgpu-codegenprepare-fold-binop-select.ll |   48 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      |  952 +++++++--------
 llvm/test/CodeGen/AMDGPU/bypass-div.ll        |   74 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          |  818 ++++++-------
 llvm/test/CodeGen/AMDGPU/div_v2i128.ll        |  994 +++++++--------
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         |  100 +-
 llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll     |   20 +-
 llvm/test/CodeGen/AMDGPU/itofp.i128.ll        |   60 +-
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          |  898 +++++++-------
 llvm/test/CodeGen/AMDGPU/sdiv.ll              | 1070 ++++++++---------
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |  294 +++--
 llvm/test/CodeGen/AMDGPU/srem64.ll            |  250 ++--
 .../X86/div-rem-pair-recomposition-signed.ll  |  298 ++---
 llvm/test/CodeGen/X86/pr38539.ll              |    2 +-
 17 files changed, 2864 insertions(+), 3050 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 02cd125eeff09..a913472525cf5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4042,7 +4042,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   }
 
   // fold B = sra (A, size(A)-1); sub (xor (A, B), B) -> (abs A)
-  if (hasOperation(ISD::ABS, VT) &&
+  if ((!LegalOperations || hasOperation(ISD::ABS, VT)) &&
       sd_match(N1, m_Sra(m_Value(A), m_SpecificInt(BitWidth - 1))) &&
       sd_match(N0, m_Xor(m_Specific(A), m_Specific(N1))))
     return DAG.getNode(ISD::ABS, DL, VT, A);
@@ -9526,7 +9526,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   }
 
   // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
-  if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+  if (!LegalOperations || hasOperation(ISD::ABS, VT)) {
     SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
     SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
     if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 78aaaca4e185b..e3270471981cc 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1623,9 +1623,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   setPrefFunctionAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
 
   setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
-
-  if (Subtarget->isThumb() || Subtarget->isThumb2())
-    setTargetDAGCombine(ISD::ABS);
 }
 
 bool ARMTargetLowering::useSoftFloat() const {
@@ -13504,18 +13501,6 @@ static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
                          DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
 }
 
-static SDValue PerformABSCombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 const ARMSubtarget *Subtarget) {
-  SelectionDAG &DAG = DCI.DAG;
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
-  if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
-    return SDValue();
-
-  return TLI.expandABS(N, DAG);
-}
-
 /// PerformADDECombine - Target-specific dag combine transform from
 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
@@ -18879,7 +18864,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT:     return PerformSELECTCombine(N, DCI, Subtarget);
   case ISD::VSELECT:    return PerformVSELECTCombine(N, DCI, Subtarget);
   case ISD::SETCC:      return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
-  case ISD::ABS:        return PerformABSCombine(N, DCI, Subtarget);
   case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
   case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index c600478b66402..fcabc9076e4d3 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -5670,22 +5670,6 @@ def : Pat<(v2i64 (zext (abdu (v2i32 DPR:$opA), (v2i32 DPR:$opB)))),
           (VABDLuv2i64 DPR:$opA, DPR:$opB)>;
 }
 
-// ISD::ABS is not legal for v2i64, so VABDL needs to be matched from the
-// shift/xor pattern for ABS.
-// TODO: Remove me.
-def abd_shr :
-    PatFrag<(ops node:$in1, node:$in2, node:$shift),
-            (ARMvshrsImm (sub (zext node:$in1),
-                            (zext node:$in2)), (i32 $shift))>;
-
-let Predicates = [HasNEON] in {
-def : Pat<(xor (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)),
-               (v2i64 (add (sub (zext (v2i32 DPR:$opA)),
-                                (zext (v2i32 DPR:$opB))),
-                           (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))),
-          (VABDLuv2i64 DPR:$opA, DPR:$opB)>;
-}
-
 //   VABA     : Vector Absolute Difference and Accumulate
 defm VABAs    : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
                              "vaba", "s", abds, add>;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 5c40a4ce13e31..cb59121d69708 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -135,31 +135,31 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v1, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
+; GCN-NEXT:    v_sub_u32_e32 v1, vcc, 0, v0
+; GCN-NEXT:    v_max_i32_e32 v1, v0, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
 ; GCN-NEXT:    s_mov_b32 s4, 0xf4240
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
 ; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
+; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
+; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
   %op = sdiv i32 1000000, %select
@@ -217,31 +217,31 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 5, vcc
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
-; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v0
+; GCN-NEXT:    v_sub_u32_e32 v1, vcc, 0, v0
+; GCN-NEXT:    v_max_i32_e32 v1, v0, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0, v1
 ; GCN-NEXT:    s_mov_b32 s4, 0xf4240
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
 ; GCN-NEXT:    v_sub_u32_e32 v3, vcc, 0xf4240, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v0
+; GCN-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GCN-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
+; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 1000000, %select
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 2ad28b8dd6ecf..8144fb7a3b646 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -245,39 +245,36 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX6-NEXT:    s_add_i32 s3, s3, s8
-; GFX6-NEXT:    s_xor_b32 s3, s3, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_sub_i32 s4, 0, s3
-; GFX6-NEXT:    s_ashr_i32 s9, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s9
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s2, s2, s9
+; GFX6-NEXT:    s_abs_i32 s8, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_sub_i32 s4, 0, s8
 ; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_xor_b32 s1, s2, s3
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_xor_b32 s0, s9, s8
+; GFX6-NEXT:    s_abs_i32 s0, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX6-NEXT:    s_mul_i32 s1, s1, s3
-; GFX6-NEXT:    s_sub_i32 s1, s2, s1
-; GFX6-NEXT:    s_sub_i32 s2, s1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s8
+; GFX6-NEXT:    s_sub_i32 s0, s0, s2
+; GFX6-NEXT:    s_sub_i32 s2, s0, s8
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_cselect_b32 s0, s2, s0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s8
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s1, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s1, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -286,16 +283,13 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s5
-; GFX9-NEXT:    s_xor_b32 s4, s5, s4
+; GFX9-NEXT:    s_abs_i32 s4, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_sub_i32 s5, 0, s4
+; GFX9-NEXT:    s_xor_b32 s3, s2, s3
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s5
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
@@ -303,18 +297,18 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
 ; GFX9-NEXT:    s_add_i32 s6, s6, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT:    s_mul_i32 s6, s5, s3
+; GFX9-NEXT:    s_mul_i32 s6, s5, s4
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s6
 ; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_sub_i32 s6, s2, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_sub_i32 s6, s2, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
 ; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX9-NEXT:    s_add_i32 s6, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s4
 ; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -366,37 +360,36 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-LABEL: srem_i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_xor_b32 s4, s3, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT:    s_sub_i32 s3, 0, s4
-; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s5
+; GFX6-NEXT:    s_abs_i32 s3, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_sub_i32 s4, 0, s3
+; GFX6-NEXT:    s_abs_i32 s8, s2
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s6, s2, s5
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX6-NEXT:    s_mul_i32 s7, s7, s4
-; GFX6-NEXT:    s_sub_i32 s6, s6, s7
-; GFX6-NEXT:    s_sub_i32 s7, s6, s4
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
-; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
-; GFX6-NEXT:    s_sub_i32 s7, s6, s4
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s7, s6
-; GFX6-NEXT:    s_xor_b32 s4, s4, s5
-; GFX6-NEXT:    s_sub_i32 s4, s4, s5
-; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX6-NEXT:    s_mul_i32 s1, s1, s3
+; GFX6-NEXT:    s_sub_i32 s1, s8, s1
+; GFX6-NEXT:    s_sub_i32 s2, s1, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
+; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_sub_i32 s2, s1, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
+; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    s_sub_i32 s0, s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: srem_i32:
@@ -404,15 +397,12 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_abs_i32 s3, s3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
@@ -1857,126 +1847,114 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX6-NEXT:    s_mov_b32 s19, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s18, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s2, s12, 31
-; GFX6-NEXT:    s_add_i32 s3, s12, s2
-; GFX6-NEXT:    s_xor_b32 s3, s3, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_sub_i32 s4, 0, s3
+; GFX6-NEXT:    s_abs_i32 s2, s12
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_sub_i32 s3, 0, s2
+; GFX6-NEXT:    s_xor_b32 s4, s8, s12
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s8, 31
-; GFX6-NEXT:    s_add_i32 s5, s8, s4
-; GFX6-NEXT:    s_xor_b32 s5, s5, s4
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    s_abs_i32 s3, s8
+; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    s_xor_b32 s8, s4, s2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s2, s5, s2
-; GFX6-NEXT:    s_sub_i32 s4, s2, s3
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX6-NEXT:    s_mul_i32 s4, s4, s2
+; GFX6-NEXT:    s_sub_i32 s3, s3, s4
+; GFX6-NEXT:    s_sub_i32 s4, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
 ; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_ashr_i32 s4, s13, 31
-; GFX6-NEXT:    s_add_i32 s5, s13, s4
-; GFX6-NEXT:    s_xor_b32 s5, s5, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX6-NEXT:    s_sub_i32 s6, 0, s5
+; GFX6-NEXT:    s_abs_i32 s4, s13
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX6-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_xor_b32 s6, s9, s13
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
-; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, s6, v2
-; GFX6-NEXT:    s_ashr_i32 s6, s9, 31
-; GFX6-NEXT:    s_add_i32 s7, s9, s6
-; GFX6-NEXT:    s_xor_b32 s7, s7, s6
+; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s5, v2
+; GFX6-NEXT:    s_abs_i32 s5, s9
+; GFX6-NEXT:    s_ashr_i32 s9, s6, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX6-NEXT:    s_xor_b32 s9, s6, s4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX6-NEXT:    s_mul_i32 s4, s4, s5
-; GFX6-NEXT:    s_sub_i32 s4, s7, s4
-; GFX6-NEXT:    s_sub_i32 s6, s4, s5
-; GFX6-NEXT:    s_cmp_ge_u32 s4, s5
+; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX6-NEXT:    s_mul_i32 s6, s6, s4
+; GFX6-NEXT:    s_sub_i32 s5, s5, s6
+; GFX6-NEXT:    s_sub_i32 s6, s5, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
-; GFX6-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s4, s5
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s4
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT:    s_ashr_i32 s6, s14, 31
-; GFX6-NEXT:    s_add_i32 s7, s14, s6
-; GFX6-NEXT:    s_xor_b32 s7, s7, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s7
-; GFX6-NEXT:    s_sub_i32 s12, 0, s7
+; GFX6-NEXT:    s_abs_i32 s6, s14
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s6
+; GFX6-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GFX6-NEXT:    v_xor_b32_e32 v2, s9, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v4
-; GFX6-NEXT:    s_ashr_i32 s12, s10, 31
-; GFX6-NEXT:    s_add_i32 s10, s10, s12
-; GFX6-NEXT:    s_xor_b32 s10, s10, s12
+; GFX6-NEXT:    v_mul_lo_u32 v5, s7, v4
+; GFX6-NEXT:    s_abs_i32 s7, s10
+; GFX6-NEXT:    s_xor_b32 s10, s10, s14
+; GFX6-NEXT:    s_ashr_i32 s10, s10, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GFX6-NEXT:    s_xor_b32 s12, s12, s6
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v4
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v4
-; GFX6-NEXT:    s_mul_i32 s6, s6, s7
-; GFX6-NEXT:    s_sub_i32 s6, s10, s6
-; GFX6-NEXT:    s_sub_i32 s10, s6, s7
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s7
+; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v4
+; GFX6-NEXT:    s_mul_i32 s12, s12, s6
+; GFX6-NEXT:    s_sub_i32 s7, s7, s12
+; GFX6-NEXT:    s_sub_i32 s12, s7, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s7, s6
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GFX6-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX6-NEXT:    s_cselect_b32 s7, s12, s7
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s6, s7
+; GFX6-NEXT:    s_cmp_ge_u32 s7, s6
 ; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT:    s_ashr_i32 s10, s15, 31
-; GFX6-NEXT:    s_add_i32 s13, s15, s10
-; GFX6-NEXT:    s_xor_b32 s13, s13, s10
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s13
-; GFX6-NEXT:    s_sub_i32 s0, 0, s13
+; GFX6-NEXT:    s_abs_i32 s12, s15
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s12
+; GFX6-NEXT:    s_sub_i32 s0, 0, s12
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
-; GFX6-NEXT:    v_xor_b32_e32 v4, s12, v4
+; GFX6-NEXT:    s_abs_i32 s1, s11
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v3
-; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
-; GFX6-NEXT:    s_add_i32 s1, s11, s0
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX6-NEXT:    s_xor_b32 s0, s0, s10
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v3, s1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s12, v4
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX6-NEXT:    s_mul_i32 s2, s2, s13
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, v5, s[6:7]
+; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v6
+; GFX6-NEXT:    s_xor_b32 s0, s11, s15
+; GFX6-NEXT:    v_xor_b32_e32 v3, s10, v3
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    v_mul_hi_u32 v2, v6, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v3
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX6-NEXT:    s_mul_i32 s2, s2, s12
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_sub_i32 s2, s1, s13
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s13
+; GFX6-NEXT:    s_sub_i32 s2, s1, s12
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v4
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s12
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; GFX6-NEXT:    s_cmp_ge_u32 s1, s13
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s12
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
@@ -1990,16 +1968,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX9-NEXT:    s_add_i32 s3, s8, s2
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX9-NEXT:    s_add_i32 s4, s4, s8
-; GFX9-NEXT:    s_xor_b32 s2, s8, s2
+; GFX9-NEXT:    s_abs_i32 s2, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT:    s_xor_b32 s3, s4, s8
+; GFX9-NEXT:    s_sub_i32 s8, 0, s2
+; GFX9-NEXT:    s_abs_i32 s4, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s4, s4, s8
-; GFX9-NEXT:    s_sub_i32 s8, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
@@ -2007,108 +1982,99 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    s_mul_hi_u32 s8, s12, s8
 ; GFX9-NEXT:    s_add_i32 s12, s12, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s12
-; GFX9-NEXT:    s_mul_i32 s12, s8, s3
+; GFX9-NEXT:    s_mul_i32 s12, s8, s2
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s12
 ; GFX9-NEXT:    s_add_i32 s13, s8, 1
-; GFX9-NEXT:    s_sub_i32 s12, s4, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX9-NEXT:    s_sub_i32 s12, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
 ; GFX9-NEXT:    s_cselect_b32 s8, s13, s8
 ; GFX9-NEXT:    s_cselect_b32 s4, s12, s4
 ; GFX9-NEXT:    s_add_i32 s12, s8, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
-; GFX9-NEXT:    s_cselect_b32 s3, s12, s8
-; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_add_i32 s8, s9, s4
-; GFX9-NEXT:    s_xor_b32 s8, s8, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
-; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s12, s8
+; GFX9-NEXT:    s_abs_i32 s4, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s8, s5, s9
+; GFX9-NEXT:    s_sub_i32 s9, 0, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s4, s9, s4
-; GFX9-NEXT:    s_sub_i32 s2, s3, s2
-; GFX9-NEXT:    s_xor_b32 s3, s5, s9
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_abs_i32 s5, s5
+; GFX9-NEXT:    s_ashr_i32 s8, s8, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s5, 0, s8
-; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s9
-; GFX9-NEXT:    s_mul_hi_u32 s5, s9, s5
-; GFX9-NEXT:    s_add_i32 s9, s9, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s9
-; GFX9-NEXT:    s_mul_i32 s9, s5, s8
-; GFX9-NEXT:    s_sub_i32 s3, s3, s9
-; GFX9-NEXT:    s_add_i32 s12, s5, 1
-; GFX9-NEXT:    s_sub_i32 s9, s3, s8
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s8
-; GFX9-NEXT:    s_cselect_b32 s5, s12, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s9, s9, s3
+; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s9
+; GFX9-NEXT:    s_add_i32 s3, s3, s9
+; GFX9-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX9-NEXT:    s_mul_i32 s9, s3, s4
+; GFX9-NEXT:    s_sub_i32 s5, s5, s9
+; GFX9-NEXT:    s_add_i32 s12, s3, 1
+; GFX9-NEXT:    s_sub_i32 s9, s5, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s12, s3
+; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX9-NEXT:    s_add_i32 s9, s3, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s9, s3
-; GFX9-NEXT:    s_add_i32 s9, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s8
-; GFX9-NEXT:    s_cselect_b32 s3, s9, s5
-; GFX9-NEXT:    s_ashr_i32 s5, s10, 31
-; GFX9-NEXT:    s_add_i32 s8, s10, s5
-; GFX9-NEXT:    s_xor_b32 s8, s8, s5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    s_ashr_i32 s9, s6, 31
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    s_add_i32 s6, s6, s9
+; GFX9-NEXT:    s_abs_i32 s4, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s8
+; GFX9-NEXT:    s_sub_i32 s9, 0, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s5, s9, s5
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s4, s6, s9
+; GFX9-NEXT:    s_xor_b32 s5, s6, s10
+; GFX9-NEXT:    s_abs_i32 s6, s6
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s6, 0, s8
-; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
-; GFX9-NEXT:    s_mul_i32 s6, s6, s9
-; GFX9-NEXT:    s_mul_hi_u32 s6, s9, s6
-; GFX9-NEXT:    s_add_i32 s9, s9, s6
-; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s9
-; GFX9-NEXT:    s_mul_i32 s9, s6, s8
-; GFX9-NEXT:    s_sub_i32 s4, s4, s9
-; GFX9-NEXT:    s_add_i32 s10, s6, 1
-; GFX9-NEXT:    s_sub_i32 s9, s4, s8
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s6, s10, s6
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_add_i32 s9, s6, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s8
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s6
-; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
-; GFX9-NEXT:    s_add_i32 s8, s11, s6
-; GFX9-NEXT:    s_xor_b32 s8, s8, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s9, s9, s8
+; GFX9-NEXT:    s_mul_hi_u32 s9, s8, s9
+; GFX9-NEXT:    s_add_i32 s8, s8, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s8
+; GFX9-NEXT:    s_mul_i32 s9, s8, s4
+; GFX9-NEXT:    s_sub_i32 s6, s6, s9
+; GFX9-NEXT:    s_add_i32 s10, s8, 1
+; GFX9-NEXT:    s_sub_i32 s9, s6, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX9-NEXT:    s_cselect_b32 s8, s10, s8
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX9-NEXT:    s_add_i32 s9, s8, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s8
+; GFX9-NEXT:    s_abs_i32 s6, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
 ; GFX9-NEXT:    s_xor_b32 s4, s4, s5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s7, s7, s2
-; GFX9-NEXT:    s_xor_b32 s6, s2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_xor_b32 s2, s7, s11
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    s_abs_i32 s3, s7
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s5
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_xor_b32 s2, s7, s2
-; GFX9-NEXT:    s_sub_i32 s5, 0, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_readfirstlane_b32 s7, v1
-; GFX9-NEXT:    s_mul_i32 s5, s5, s7
-; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s5
-; GFX9-NEXT:    s_add_i32 s7, s7, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s7
-; GFX9-NEXT:    s_mul_i32 s7, s5, s8
-; GFX9-NEXT:    s_sub_i32 s2, s2, s7
-; GFX9-NEXT:    s_add_i32 s9, s5, 1
-; GFX9-NEXT:    s_sub_i32 s7, s2, s8
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
-; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
-; GFX9-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 31
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX9-NEXT:    s_mul_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s7
+; GFX9-NEXT:    s_add_i32 s8, s5, 1
+; GFX9-NEXT:    s_sub_i32 s7, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
 ; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
-; GFX9-NEXT:    s_cselect_b32 s2, s7, s5
-; GFX9-NEXT:    s_xor_b32 s2, s2, s6
-; GFX9-NEXT:    s_sub_i32 s2, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -2279,116 +2245,104 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX6-NEXT:    s_add_i32 s3, s8, s2
-; GFX6-NEXT:    s_xor_b32 s2, s3, s2
+; GFX6-NEXT:    s_abs_i32 s2, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT:    s_ashr_i32 s3, s4, 31
-; GFX6-NEXT:    s_add_i32 s4, s4, s3
-; GFX6-NEXT:    s_xor_b32 s4, s4, s3
+; GFX6-NEXT:    s_abs_i32 s3, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX6-NEXT:    s_mul_i32 s8, s8, s2
-; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    s_sub_i32 s8, s4, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s4, s2
-; GFX6-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX6-NEXT:    s_sub_i32 s8, s4, s2
-; GFX6-NEXT:    s_cmp_ge_u32 s4, s2
-; GFX6-NEXT:    s_cselect_b32 s2, s8, s4
-; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX6-NEXT:    s_add_i32 s8, s9, s4
-; GFX6-NEXT:    s_xor_b32 s4, s8, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT:    s_sub_i32 s8, 0, s4
-; GFX6-NEXT:    s_xor_b32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s9, s2, s3
+; GFX6-NEXT:    s_sub_i32 s3, s3, s8
+; GFX6-NEXT:    s_sub_i32 s8, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX6-NEXT:    s_sub_i32 s8, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b32 s2, s8, s3
+; GFX6-NEXT:    s_abs_i32 s3, s9
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_sub_i32 s8, 0, s3
+; GFX6-NEXT:    s_xor_b32 s2, s2, s4
+; GFX6-NEXT:    s_sub_i32 s4, s2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v0
-; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX6-NEXT:    s_add_i32 s5, s5, s8
-; GFX6-NEXT:    s_xor_b32 s5, s5, s8
+; GFX6-NEXT:    s_abs_i32 s8, s5
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s2, s2, s4
-; GFX6-NEXT:    s_sub_i32 s2, s5, s2
-; GFX6-NEXT:    s_sub_i32 s3, s2, s4
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX6-NEXT:    s_sub_i32 s3, s2, s4
-; GFX6-NEXT:    s_cmp_ge_u32 s2, s4
-; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
-; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
-; GFX6-NEXT:    s_add_i32 s4, s10, s3
-; GFX6-NEXT:    s_xor_b32 s3, s4, s3
+; GFX6-NEXT:    s_mul_i32 s2, s2, s3
+; GFX6-NEXT:    s_sub_i32 s2, s8, s2
+; GFX6-NEXT:    s_sub_i32 s8, s2, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX6-NEXT:    s_sub_i32 s8, s2, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX6-NEXT:    s_abs_i32 s3, s10
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_sub_i32 s4, 0, s3
-; GFX6-NEXT:    s_xor_b32 s2, s2, s8
+; GFX6-NEXT:    s_sub_i32 s8, 0, s3
+; GFX6-NEXT:    s_xor_b32 s2, s2, s5
+; GFX6-NEXT:    s_sub_i32 s5, s2, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
-; GFX6-NEXT:    s_add_i32 s5, s6, s4
-; GFX6-NEXT:    s_xor_b32 s5, s5, s4
+; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v0
+; GFX6-NEXT:    s_abs_i32 s8, s6
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    s_sub_i32 s6, s2, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX6-NEXT:    s_mul_i32 s2, s2, s3
-; GFX6-NEXT:    s_sub_i32 s2, s5, s2
-; GFX6-NEXT:    s_sub_i32 s5, s2, s3
+; GFX6-NEXT:    s_sub_i32 s2, s8, s2
+; GFX6-NEXT:    s_sub_i32 s8, s2, s3
 ; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX6-NEXT:    s_sub_i32 s5, s2, s3
+; GFX6-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX6-NEXT:    s_sub_i32 s8, s2, s3
 ; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX6-NEXT:    s_cselect_b32 s5, s5, s2
-; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX6-NEXT:    s_add_i32 s3, s11, s2
-; GFX6-NEXT:    s_xor_b32 s8, s3, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    s_sub_i32 s10, 0, s8
-; GFX6-NEXT:    s_xor_b32 s5, s5, s4
-; GFX6-NEXT:    s_sub_i32 s4, s5, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_cselect_b32 s8, s8, s2
+; GFX6-NEXT:    s_abs_i32 s9, s11
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX6-NEXT:    s_sub_i32 s2, 0, s9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v0
-; GFX6-NEXT:    v_mov_b32_e32 v0, s9
-; GFX6-NEXT:    s_ashr_i32 s9, s7, 31
-; GFX6-NEXT:    s_add_i32 s7, s7, s9
-; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX6-NEXT:    s_xor_b32 s7, s7, s9
-; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s6
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
-; GFX6-NEXT:    s_mul_i32 s5, s5, s8
-; GFX6-NEXT:    s_sub_i32 s5, s7, s5
-; GFX6-NEXT:    s_sub_i32 s6, s5, s8
-; GFX6-NEXT:    s_cmp_ge_u32 s5, s8
-; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
-; GFX6-NEXT:    s_sub_i32 s6, s5, s8
-; GFX6-NEXT:    s_cmp_ge_u32 s5, s8
-; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
-; GFX6-NEXT:    s_xor_b32 s5, s5, s9
-; GFX6-NEXT:    s_sub_i32 s5, s5, s9
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s5
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_abs_i32 s4, s7
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v2
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    s_ashr_i32 s5, s7, 31
+; GFX6-NEXT:    s_xor_b32 s7, s8, s6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_mul_hi_u32 v2, s4, v2
+; GFX6-NEXT:    s_sub_i32 s6, s7, s6
+; GFX6-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX6-NEXT:    s_mul_i32 s7, s7, s9
+; GFX6-NEXT:    s_sub_i32 s4, s4, s7
+; GFX6-NEXT:    s_sub_i32 s7, s4, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s9
+; GFX6-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX6-NEXT:    s_sub_i32 s7, s4, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s9
+; GFX6-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -2398,15 +2352,12 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX9-NEXT:    s_add_i32 s3, s8, s2
-; GFX9-NEXT:    s_xor_b32 s2, s3, s2
+; GFX9-NEXT:    s_abs_i32 s2, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    s_sub_i32 s8, 0, s2
 ; GFX9-NEXT:    s_ashr_i32 s3, s4, 31
-; GFX9-NEXT:    s_add_i32 s4, s4, s3
+; GFX9-NEXT:    s_abs_i32 s4, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s4, s4, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
@@ -2422,51 +2373,44 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    s_sub_i32 s8, s4, s2
 ; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
 ; GFX9-NEXT:    s_cselect_b32 s2, s8, s4
-; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_add_i32 s8, s9, s4
-; GFX9-NEXT:    s_xor_b32 s4, s8, s4
+; GFX9-NEXT:    s_abs_i32 s4, s9
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_add_i32 s5, s5, s8
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s9, 0, s4
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s3, s5, s8
-; GFX9-NEXT:    s_sub_i32 s5, 0, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    s_abs_i32 s5, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s9
-; GFX9-NEXT:    s_mul_hi_u32 s5, s9, s5
-; GFX9-NEXT:    s_add_i32 s9, s9, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s9
-; GFX9-NEXT:    s_mul_i32 s5, s5, s4
-; GFX9-NEXT:    s_sub_i32 s3, s3, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s9, s9, s3
+; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s9
+; GFX9-NEXT:    s_add_i32 s3, s3, s9
+; GFX9-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX9-NEXT:    s_mul_i32 s3, s3, s4
+; GFX9-NEXT:    s_sub_i32 s3, s5, s3
 ; GFX9-NEXT:    s_sub_i32 s5, s3, s4
 ; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
 ; GFX9-NEXT:    s_sub_i32 s5, s3, s4
 ; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
-; GFX9-NEXT:    s_ashr_i32 s4, s10, 31
-; GFX9-NEXT:    s_add_i32 s5, s10, s4
-; GFX9-NEXT:    s_xor_b32 s4, s5, s4
+; GFX9-NEXT:    s_abs_i32 s4, s10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s8
+; GFX9-NEXT:    s_sub_i32 s9, 0, s4
 ; GFX9-NEXT:    s_sub_i32 s3, s3, s8
-; GFX9-NEXT:    s_sub_i32 s8, 0, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    s_xor_b32 s6, s6, s5
+; GFX9-NEXT:    s_abs_i32 s6, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
-; GFX9-NEXT:    s_mul_i32 s8, s8, s9
-; GFX9-NEXT:    s_mul_hi_u32 s8, s9, s8
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s9
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s9, s9, s8
+; GFX9-NEXT:    s_mul_hi_u32 s9, s8, s9
+; GFX9-NEXT:    s_add_i32 s8, s8, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s8
 ; GFX9-NEXT:    s_mul_i32 s8, s8, s4
 ; GFX9-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX9-NEXT:    s_sub_i32 s8, s6, s4
@@ -2475,36 +2419,34 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    s_sub_i32 s8, s6, s4
 ; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
 ; GFX9-NEXT:    s_cselect_b32 s4, s8, s6
-; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
-; GFX9-NEXT:    s_add_i32 s8, s11, s6
-; GFX9-NEXT:    s_xor_b32 s6, s8, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    s_abs_i32 s6, s11
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    s_xor_b32 s4, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX9-NEXT:    s_xor_b32 s3, s4, s5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    s_add_i32 s4, s7, s2
-; GFX9-NEXT:    s_sub_i32 s3, s3, s5
-; GFX9-NEXT:    s_sub_i32 s5, 0, s6
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    s_abs_i32 s3, s7
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    s_xor_b32 s4, s4, s2
-; GFX9-NEXT:    v_readfirstlane_b32 s7, v2
-; GFX9-NEXT:    s_mul_i32 s5, s5, s7
-; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s5
-; GFX9-NEXT:    s_add_i32 s7, s7, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s7
-; GFX9-NEXT:    s_mul_i32 s5, s5, s6
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s5
-; GFX9-NEXT:    s_sub_i32 s5, s4, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX9-NEXT:    s_sub_i32 s5, s4, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s6
-; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
-; GFX9-NEXT:    s_xor_b32 s4, s4, s2
-; GFX9-NEXT:    s_sub_i32 s2, s4, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX9-NEXT:    s_mul_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s5
+; GFX9-NEXT:    s_sub_i32 s5, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_sub_i32 s5, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -6538,71 +6480,65 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
-; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s3
-; GFX6-NEXT:    s_xor_b32 s2, s2, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s2
+; GFX6-NEXT:    s_abs_i32 s3, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_sub_i32 s6, 0, s3
+; GFX6-NEXT:    s_xor_b32 s2, s4, s2
 ; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
-; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
-; GFX6-NEXT:    s_add_i32 s4, s4, s6
-; GFX6-NEXT:    s_xor_b32 s4, s4, s6
+; GFX6-NEXT:    s_abs_i32 s6, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    s_xor_b32 s6, s6, s3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX6-NEXT:    s_mul_i32 s3, s3, s2
-; GFX6-NEXT:    s_sub_i32 s3, s4, s3
-; GFX6-NEXT:    s_sub_i32 s4, s3, s2
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s3
+; GFX6-NEXT:    s_sub_i32 s2, s6, s2
+; GFX6-NEXT:    s_sub_i32 s6, s2, s3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX6-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX6-NEXT:    s_ashr_i32 s4, s7, 31
-; GFX6-NEXT:    s_add_i32 s7, s7, s4
-; GFX6-NEXT:    s_xor_b32 s7, s7, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s7
-; GFX6-NEXT:    s_sub_i32 s8, 0, s7
+; GFX6-NEXT:    s_abs_i32 s6, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    s_xor_b32 s7, s5, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_abs_i32 s5, s5
+; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    s_ashr_i32 s7, s7, 31
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v2
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v2
-; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX6-NEXT:    s_add_i32 s5, s5, s8
-; GFX6-NEXT:    s_xor_b32 s5, s5, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v3
-; GFX6-NEXT:    s_xor_b32 s4, s8, s4
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX6-NEXT:    s_mul_i32 s6, s6, s7
-; GFX6-NEXT:    s_sub_i32 s5, s5, s6
-; GFX6-NEXT:    s_sub_i32 s6, s5, s7
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX6-NEXT:    s_mul_i32 s4, s4, s6
+; GFX6-NEXT:    s_sub_i32 s4, s5, s4
+; GFX6-NEXT:    s_sub_i32 s5, s4, s6
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT:    s_cmp_ge_u32 s5, s7
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s6
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; GFX6-NEXT:    s_cmp_ge_u32 s5, s7
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s6
 ; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, s7, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s7, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -6613,65 +6549,59 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s6
-; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT:    s_abs_i32 s3, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s7
-; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
-; GFX9-NEXT:    s_add_i32 s4, s4, s7
+; GFX9-NEXT:    s_abs_i32 s7, s4
+; GFX9-NEXT:    s_xor_b32 s2, s4, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s3, s7, s3
-; GFX9-NEXT:    s_xor_b32 s4, s4, s7
-; GFX9-NEXT:    s_sub_i32 s7, 0, s2
+; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX9-NEXT:    s_mul_i32 s7, s7, s8
-; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
-; GFX9-NEXT:    s_add_i32 s8, s8, s7
-; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s8
-; GFX9-NEXT:    s_mul_i32 s8, s7, s2
-; GFX9-NEXT:    s_sub_i32 s4, s4, s8
-; GFX9-NEXT:    s_add_i32 s9, s7, 1
-; GFX9-NEXT:    s_sub_i32 s8, s4, s2
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
-; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
-; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX9-NEXT:    s_add_i32 s8, s7, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
-; GFX9-NEXT:    s_cselect_b32 s2, s8, s7
-; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s4
-; GFX9-NEXT:    s_xor_b32 s6, s6, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
-; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_i32 s4, s4, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT:    s_add_i32 s8, s8, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s7, s8
+; GFX9-NEXT:    s_mul_i32 s8, s4, s3
+; GFX9-NEXT:    s_sub_i32 s7, s7, s8
+; GFX9-NEXT:    s_add_i32 s9, s4, 1
+; GFX9-NEXT:    s_sub_i32 s8, s7, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX9-NEXT:    s_cselect_b32 s7, s8, s7
+; GFX9-NEXT:    s_add_i32 s8, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX9-NEXT:    s_cselect_b32 s3, s8, s4
+; GFX9-NEXT:    s_abs_i32 s4, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_sub_i32 s7, 0, s4
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s4, s7, s4
-; GFX9-NEXT:    s_sub_i32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s3, s5, s7
+; GFX9-NEXT:    s_xor_b32 s6, s5, s6
+; GFX9-NEXT:    s_abs_i32 s5, s5
+; GFX9-NEXT:    s_ashr_i32 s6, s6, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s5, 0, s6
-; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s7
-; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s5
-; GFX9-NEXT:    s_add_i32 s7, s7, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s7
-; GFX9-NEXT:    s_mul_i32 s7, s5, s6
-; GFX9-NEXT:    s_sub_i32 s3, s3, s7
-; GFX9-NEXT:    s_add_i32 s8, s5, 1
-; GFX9-NEXT:    s_sub_i32 s7, s3, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
-; GFX9-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s7, s7, s3
+; GFX9-NEXT:    s_mul_hi_u32 s7, s3, s7
+; GFX9-NEXT:    s_add_i32 s3, s3, s7
+; GFX9-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX9-NEXT:    s_mul_i32 s7, s3, s4
+; GFX9-NEXT:    s_sub_i32 s5, s5, s7
+; GFX9-NEXT:    s_add_i32 s8, s3, 1
+; GFX9-NEXT:    s_sub_i32 s7, s5, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s3, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
 ; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
-; GFX9-NEXT:    s_add_i32 s7, s5, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
-; GFX9-NEXT:    s_cselect_b32 s3, s7, s5
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
-; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -7001,19 +6931,16 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
-; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s3
-; GFX6-NEXT:    s_xor_b32 s2, s2, s3
+; GFX6-NEXT:    s_abs_i32 s2, s2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    s_sub_i32 s3, 0, s2
-; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT:    s_add_i32 s3, s4, s6
-; GFX6-NEXT:    s_xor_b32 s3, s3, s6
-; GFX6-NEXT:    s_lshl_b32 s4, 0x1000, s7
+; GFX6-NEXT:    s_abs_i32 s3, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 31
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
@@ -7026,38 +6953,35 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_sub_i32 s7, s3, s2
 ; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
 ; GFX6-NEXT:    s_cselect_b32 s7, s7, s3
-; GFX6-NEXT:    s_ashr_i32 s2, s4, 31
-; GFX6-NEXT:    s_add_i32 s4, s4, s2
-; GFX6-NEXT:    s_xor_b32 s4, s4, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT:    s_sub_i32 s2, 0, s4
-; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX6-NEXT:    s_xor_b32 s7, s7, s6
+; GFX6-NEXT:    s_abs_i32 s6, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
+; GFX6-NEXT:    s_abs_i32 s8, s5
+; GFX6-NEXT:    s_xor_b32 s7, s7, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s6, s7, s6
+; GFX6-NEXT:    s_sub_i32 s4, s7, s4
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
-; GFX6-NEXT:    s_add_i32 s2, s5, s8
-; GFX6-NEXT:    s_xor_b32 s5, s2, s8
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX6-NEXT:    s_mul_i32 s7, s7, s4
-; GFX6-NEXT:    s_sub_i32 s5, s5, s7
-; GFX6-NEXT:    s_sub_i32 s7, s5, s4
-; GFX6-NEXT:    s_cmp_ge_u32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s5, s7, s5
-; GFX6-NEXT:    s_sub_i32 s7, s5, s4
-; GFX6-NEXT:    s_cmp_ge_u32 s5, s4
-; GFX6-NEXT:    s_cselect_b32 s4, s7, s5
-; GFX6-NEXT:    s_xor_b32 s4, s4, s8
-; GFX6-NEXT:    s_sub_i32 s4, s4, s8
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
+; GFX6-NEXT:    s_mul_i32 s7, s7, s6
+; GFX6-NEXT:    s_sub_i32 s7, s8, s7
+; GFX6-NEXT:    s_sub_i32 s8, s7, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s7, s6
+; GFX6-NEXT:    s_cselect_b32 s7, s8, s7
+; GFX6-NEXT:    s_sub_i32 s8, s7, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s7, s6
+; GFX6-NEXT:    s_cselect_b32 s6, s8, s7
+; GFX6-NEXT:    s_xor_b32 s6, s6, s5
+; GFX6-NEXT:    s_sub_i32 s5, s6, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -7068,16 +6992,13 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s6
-; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s7
 ; GFX9-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_add_i32 s4, s4, s6
-; GFX9-NEXT:    s_xor_b32 s4, s4, s6
+; GFX9-NEXT:    s_abs_i32 s4, s4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
@@ -7093,24 +7014,21 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_sub_i32 s7, s4, s2
 ; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
 ; GFX9-NEXT:    s_cselect_b32 s2, s7, s4
-; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_abs_i32 s3, s3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_xor_b32 s2, s2, s6
+; GFX9-NEXT:    s_sub_i32 s7, 0, s3
 ; GFX9-NEXT:    s_sub_i32 s2, s2, s6
-; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_ashr_i32 s4, s5, 31
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    s_xor_b32 s5, s5, s4
+; GFX9-NEXT:    s_abs_i32 s5, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX9-NEXT:    s_mul_i32 s6, s6, s7
-; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT:    s_add_i32 s7, s7, s6
-; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s7, s7, s6
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s6
 ; GFX9-NEXT:    s_mul_i32 s6, s6, s3
 ; GFX9-NEXT:    s_sub_i32 s5, s5, s6
 ; GFX9-NEXT:    s_sub_i32 s6, s5, s3
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 4d8687b141a79..5bbea7ecf3f2d 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -575,34 +575,33 @@ define i32 @sdiv32(i32 %a, i32 %b) {
 ; GFX9-LABEL: sdiv32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v5
+; GFX9-NEXT:    v_sub_u32_e32 v2, 0, v1
+; GFX9-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v2
+; GFX9-NEXT:    v_sub_u32_e32 v5, 0, v0
+; GFX9-NEXT:    v_max_i32_e32 v5, v0, v5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
-; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v2
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v1, 1, v3
+; GFX9-NEXT:    v_sub_u32_e32 v4, v5, v4
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v3, v4, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i32 %a, %b
   ret i32 %d
@@ -640,31 +639,30 @@ define i32 @srem32(i32 %a, i32 %b) {
 ; GFX9-LABEL: srem32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, 0, v1
+; GFX9-NEXT:    v_max_i32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v4
+; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v0
+; GFX9-NEXT:    v_max_i32_e32 v4, v0, v4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
+; GFX9-NEXT:    v_sub_u32_e32 v2, v4, v2
+; GFX9-NEXT:    v_sub_u32_e32 v3, v2, v1
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v3, v2, v1
+; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = srem i32 %a, %b
   ret i32 %d
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 7d8eba1e87080..f0ab3a5342e01 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -9,26 +9,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_sdiv_i128_vv:
 ; GFX9:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, 0, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v16, 31, v3
-; GFX9-NEXT:    v_xor_b32_e32 v0, v16, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, v16, v1
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v16
-; GFX9-NEXT:    v_xor_b32_e32 v2, v16, v2
-; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v1, v16, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, 0, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
-; GFX9-NEXT:    v_xor_b32_e32 v3, v16, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, v2, v16, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, v3, v16, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, v17, v4
-; GFX9-NEXT:    v_xor_b32_e32 v2, v17, v5
-; GFX9-NEXT:    v_sub_co_u32_e32 v20, vcc, v3, v17
-; GFX9-NEXT:    v_xor_b32_e32 v0, v17, v6
-; GFX9-NEXT:    v_subb_co_u32_e32 v21, vcc, v2, v17, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, v17, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v17, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v17, vcc
-; GFX9-NEXT:    v_or_b32_e32 v3, v21, v1
-; GFX9-NEXT:    v_or_b32_e32 v2, v20, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v4, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
+; GFX9-NEXT:    v_or_b32_e32 v3, v20, v1
+; GFX9-NEXT:    v_or_b32_e32 v2, v21, v0
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v3, v9, v11
 ; GFX9-NEXT:    v_or_b32_e32 v2, v8, v10
@@ -37,35 +39,35 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
 ; GFX9-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GFX9-NEXT:    v_min_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_ffbh_u32_e32 v3, v20
+; GFX9-NEXT:    v_ffbh_u32_e32 v3, v21
 ; GFX9-NEXT:    v_add_u32_e32 v3, 32, v3
-; GFX9-NEXT:    v_ffbh_u32_e32 v4, v21
+; GFX9-NEXT:    v_ffbh_u32_e32 v4, v20
 ; GFX9-NEXT:    v_min_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_ffbh_u32_e32 v5, v11
+; GFX9-NEXT:    v_ffbh_u32_e32 v6, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_ffbh_u32_e32 v3, v10
 ; GFX9-NEXT:    v_add_u32_e32 v3, 32, v3
-; GFX9-NEXT:    v_min_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_ffbh_u32_e32 v5, v8
-; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT:    v_ffbh_u32_e32 v6, v9
-; GFX9-NEXT:    v_min_u32_e32 v5, v5, v6
+; GFX9-NEXT:    v_min_u32_e32 v3, v3, v6
+; GFX9-NEXT:    v_ffbh_u32_e32 v6, v8
+; GFX9-NEXT:    v_add_u32_e32 v6, 32, v6
+; GFX9-NEXT:    v_ffbh_u32_e32 v7, v9
+; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, 64, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[6:7], 0, 0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 64, v6
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v6, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v18, v16
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
@@ -138,8 +140,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v12, v8, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v26, vcc, -1, v20
-; GFX9-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v21, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v26, vcc, -1, v21
+; GFX9-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v20, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v28, vcc, -1, v0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0
@@ -164,9 +166,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v14, vcc, v28, v10, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v14, vcc, v29, v11, vcc
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v30, 31, v14
-; GFX9-NEXT:    v_and_b32_e32 v14, v30, v20
-; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v14
 ; GFX9-NEXT:    v_and_b32_e32 v14, v30, v21
+; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v14
+; GFX9-NEXT:    v_and_b32_e32 v14, v30, v20
 ; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v9, v14, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v14, v30, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v14, vcc
@@ -218,238 +220,244 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v4
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v2
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
+; GFX9-O0-NEXT:    ; kill: def $vgpr21 killed $vgpr21 def $vgpr21_vgpr22 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v2
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v4
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[13:14], s4, v[11:12]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v2
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[11:12], s4, v[11:12]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v14
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v6, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v13
-; GFX9-O0-NEXT:    v_xor_b32_e64 v13, v4, v5
-; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v10
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v6, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_xor_b32_e64 v15, v4, v5
-; GFX9-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v16
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v14
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v9, vcc, v9, v4
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v13, vcc, v10, v4, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v11
+; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 1
+; GFX9-O0-NEXT:    s_mov_b32 s10, s6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s10, 2
+; GFX9-O0-NEXT:    s_mov_b32 s11, s7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s11, 3
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, s10, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v3, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v14, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v20, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v7
+; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[10:11], s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v4
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v20, v1, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v14, v1, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v5, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
-; GFX9-O0-NEXT:    v_xor_b32_e64 v11, v3, v2
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v8
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v5, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v7
-; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v3, v2
-; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v12
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v3
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v8, vcc, v8, v5, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v7, v3, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v2, v5, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v8
+; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v22
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v18, vcc, s10, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v8, v9, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s10
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v8, v13, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v8, vcc, v8, v15, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[21:22], s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, v9, v10, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v18
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v10
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v12
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, v15, v8, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v7
-; GFX9-O0-NEXT:    v_xor_b32_e64 v5, v5, v6
-; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v4
-; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    v_xor_b32_e64 v15, v15, v20
+; GFX9-O0-NEXT:    v_xor_b32_e64 v13, v13, v14
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v15
+; GFX9-O0-NEXT:    s_mov_b32 s4, 63
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[13:14], s4, v[13:14]
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v19
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v5
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v17
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT:    v_or_b32_e64 v3, v8, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT:    v_or_b32_e64 v1, v5, v6
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 1
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    v_or_b32_e64 v15, v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT:    v_or_b32_e64 v9, v3, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v15
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v19
+; GFX9-O0-NEXT:    v_or_b32_e64 v15, v13, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v18
+; GFX9-O0-NEXT:    v_or_b32_e64 v13, v13, v14
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v15
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX9-O0-NEXT:    v_or_b32_e64 v15, v13, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v16
+; GFX9-O0-NEXT:    v_or_b32_e64 v13, v13, v14
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v15
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7]
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v6, v6
-; GFX9-O0-NEXT:    s_mov_b32 s9, 32
-; GFX9-O0-NEXT:    v_add_u32_e64 v6, v6, s9
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v7, v7
-; GFX9-O0-NEXT:    v_min_u32_e64 v6, v6, v7
-; GFX9-O0-NEXT:    s_mov_b32 s8, 0
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s8
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v7
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v5
-; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s9
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9]
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v8, v8
-; GFX9-O0-NEXT:    v_min_u32_e64 v15, v5, v8
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v5
-; GFX9-O0-NEXT:    s_mov_b64 s[10:11], 64
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v15
-; GFX9-O0-NEXT:    s_mov_b32 s12, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v16
-; GFX9-O0-NEXT:    s_mov_b32 s14, s11
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v8, s[12:13], v8, s12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s14
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13]
+; GFX9-O0-NEXT:    s_mov_b32 s13, 32
+; GFX9-O0-NEXT:    v_add_u32_e64 v8, v8, s13
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v9, v9
+; GFX9-O0-NEXT:    v_min_u32_e64 v8, v8, v9
+; GFX9-O0-NEXT:    s_mov_b32 s12, 0
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    s_mov_b64 s[12:13], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s[12:13]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, v6, v7, s[12:13]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v7, v7
+; GFX9-O0-NEXT:    v_add_u32_e64 v7, v7, s13
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v10
+; GFX9-O0-NEXT:    v_min_u32_e64 v13, v7, v10
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v7
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], 64
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
+; GFX9-O0-NEXT:    s_mov_b32 s16, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT:    s_mov_b32 s18, s15
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v10, s[16:17], v10, s16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s18
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17]
+; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, v8, v9, s[8:9]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v7
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9]
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v1
-; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s9
+; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s13
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v6, v2
 ; GFX9-O0-NEXT:    v_min_u32_e64 v6, v5, v6
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v7
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v3
-; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s9
+; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s13
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v11, v4
-; GFX9-O0-NEXT:    v_min_u32_e64 v15, v5, v11
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v15
-; GFX9-O0-NEXT:    s_mov_b32 s8, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v16
-; GFX9-O0-NEXT:    s_mov_b32 s10, s11
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v11, s[8:9], v11, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s10
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9]
+; GFX9-O0-NEXT:    v_min_u32_e64 v12, v5, v11
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v12
+; GFX9-O0-NEXT:    s_mov_b32 s12, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX9-O0-NEXT:    s_mov_b32 s14, s15
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v11, s[12:13], v11, s12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s14
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v5, v12, s[12:13]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
@@ -462,8 +470,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT:    s_mov_b32 s10, s6
-; GFX9-O0-NEXT:    s_mov_b32 s11, s7
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v5, vcc, v5, v8
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v9, vcc, v6, v7, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s10
@@ -546,75 +552,75 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s5, 3
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s4, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s5, 5
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX9-O0-NEXT:    s_branch .LBB0_8
 ; GFX9-O0-NEXT:  .LBB0_1: ; %Flow
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v0, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v0, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v0, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v0, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_5
 ; GFX9-O0-NEXT:  .LBB0_3: ; %Flow2
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v4, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v4, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v4, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v4, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -647,67 +653,67 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_3
 ; GFX9-O0-NEXT:  .LBB0_5: ; %Flow1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v16, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v16, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v16, 11
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -847,72 +853,72 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v15
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 11
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-O0-NEXT:  .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
@@ -993,51 +999,51 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s4, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s5, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s4, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s5, 11
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -1074,14 +1080,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v3, s4, v4
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1127,12 +1133,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v10
 ; GFX9-O0-NEXT:    v_or_b32_e64 v3, v3, v4
@@ -1147,39 +1153,39 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 7
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX9-O0-NEXT:    s_branch .LBB0_7
 ; GFX9-O0-NEXT:  .LBB0_9: ; %udiv-end
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -1224,11 +1230,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    ; kill: killed $vgpr4
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 16a03badcb132..e04cd71125608 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -6,185 +6,187 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-LABEL: v_sdiv_v2i128_vv:
 ; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_ashrrev_i32_e32 v24, 31, v3
-; SDAG-NEXT:    v_ashrrev_i32_e32 v25, 31, v11
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    v_ashrrev_i32_e32 v26, 31, v3
+; SDAG-NEXT:    v_ashrrev_i32_e32 v27, 31, v11
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 0, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT:    v_mov_b32_e32 v26, v24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v25
-; SDAG-NEXT:    v_xor_b32_e32 v17, v24, v3
-; SDAG-NEXT:    v_xor_b32_e32 v18, v24, v2
-; SDAG-NEXT:    v_xor_b32_e32 v1, v24, v1
-; SDAG-NEXT:    v_xor_b32_e32 v0, v24, v0
-; SDAG-NEXT:    v_xor_b32_e32 v19, v25, v11
-; SDAG-NEXT:    v_xor_b32_e32 v20, v25, v10
-; SDAG-NEXT:    v_xor_b32_e32 v9, v25, v9
-; SDAG-NEXT:    v_xor_b32_e32 v8, v25, v8
-; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v0, v24
-; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v1, v24, vcc
-; SDAG-NEXT:    v_ffbh_u32_e32 v0, v2
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v18, v24, vcc
-; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], 32, v0
-; SDAG-NEXT:    v_ffbh_u32_e32 v18, v3
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v17, v24, vcc
-; SDAG-NEXT:    v_or_b32_e32 v0, v2, v10
-; SDAG-NEXT:    v_ffbh_u32_e32 v17, v10
-; SDAG-NEXT:    v_min_u32_e32 v18, v1, v18
-; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, v8, v25
-; SDAG-NEXT:    v_or_b32_e32 v1, v3, v11
-; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], 32, v17
-; SDAG-NEXT:    v_ffbh_u32_e32 v17, v11
-; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 64, v18
-; SDAG-NEXT:    v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v9, v25, vcc
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; SDAG-NEXT:    v_ffbh_u32_e32 v1, v28
-; SDAG-NEXT:    v_min_u32_e32 v8, v8, v17
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, 0, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v20, v25, vcc
-; SDAG-NEXT:    v_add_i32_e64 v9, s[8:9], 32, v1
-; SDAG-NEXT:    v_ffbh_u32_e32 v20, v29
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v8, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v19, v25, vcc
-; SDAG-NEXT:    v_or_b32_e32 v8, v28, v0
-; SDAG-NEXT:    v_ffbh_u32_e32 v19, v0
-; SDAG-NEXT:    v_min_u32_e32 v20, v9, v20
-; SDAG-NEXT:    v_or_b32_e32 v9, v29, v1
-; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 32, v19
+; SDAG-NEXT:    v_mov_b32_e32 v28, v26
+; SDAG-NEXT:    v_mov_b32_e32 v29, v27
+; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v1, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v0, v16, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v16
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v17
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[4:5]
+; SDAG-NEXT:    v_sub_i32_e32 v20, vcc, 0, v8
+; SDAG-NEXT:    v_or_b32_e32 v0, v16, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v2
+; SDAG-NEXT:    v_add_i32_e64 v22, s[4:5], 32, v1
+; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v17, v3
+; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], 32, v21
+; SDAG-NEXT:    v_min_u32_e32 v18, v22, v18
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v3
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v30, v9, v23, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v10, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v31, v8, v20, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_min_u32_e32 v1, v21, v22
+; SDAG-NEXT:    v_add_i32_e64 v8, s[8:9], 64, v18
+; SDAG-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v11, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v10, v9, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v9, v31
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v18, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v8, v1, vcc
+; SDAG-NEXT:    v_ffbh_u32_e32 v21, v30
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v11, v20, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v8, v31, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v0
+; SDAG-NEXT:    v_add_i32_e32 v20, vcc, 32, v9
+; SDAG-NEXT:    v_or_b32_e32 v9, v30, v1
+; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
+; SDAG-NEXT:    v_min_u32_e32 v20, v20, v21
 ; SDAG-NEXT:    v_ffbh_u32_e32 v21, v1
-; SDAG-NEXT:    v_add_i32_e32 v20, vcc, 64, v20
-; SDAG-NEXT:    v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT:    v_min_u32_e32 v8, v19, v21
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v22, 0, s[6:7]
-; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v8, s[6:7]
+; SDAG-NEXT:    v_min_u32_e32 v8, v11, v21
+; SDAG-NEXT:    v_add_i32_e64 v9, s[4:5], 64, v20
+; SDAG-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v11, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v8, v18
-; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v9, v17, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v17, 0x7f, v8
-; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v16, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v11, v10, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v10, 0x7f, v8
+; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v19, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v16, vcc
-; SDAG-NEXT:    v_or_b32_e32 v16, v17, v18
+; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v18
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v17, v9, v19
+; SDAG-NEXT:    v_or_b32_e32 v11, v9, v19
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v20
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, v11, 0, s[4:5]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_and_b32_e32 v10, 1, v20
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v20, v3, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v10, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, v3, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v2, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v2, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, v17, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v23, v16, 0, s[4:5]
 ; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_6
 ; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v8
+; SDAG-NEXT:    v_add_i32_e32 v32, vcc, 1, v8
 ; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v8
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
-; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v9, vcc
-; SDAG-NEXT:    v_lshl_b64 v[20:21], v[2:3], v20
-; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v18, vcc
-; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v19, vcc
-; SDAG-NEXT:    v_or_b32_e32 v18, v30, v32
-; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v8
-; SDAG-NEXT:    v_or_b32_e32 v19, v31, v33
-; SDAG-NEXT:    v_lshl_b64 v[8:9], v[10:11], v34
-; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v34
-; SDAG-NEXT:    v_lshl_b64 v[22:23], v[2:3], v34
+; SDAG-NEXT:    v_mov_b32_e32 v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[16:17], v20
+; SDAG-NEXT:    v_addc_u32_e32 v34, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v18, v32, v34
+; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v8
+; SDAG-NEXT:    v_or_b32_e32 v19, v33, v35
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[2:3], v24
+; SDAG-NEXT:    v_sub_i32_e32 v25, vcc, 64, v24
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[16:17], v24
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT:    v_lshr_b64 v[18:19], v[2:3], v35
+; SDAG-NEXT:    v_lshr_b64 v[18:19], v[16:17], v25
 ; SDAG-NEXT:    v_or_b32_e32 v9, v9, v19
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v18
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, v21, v9, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v8, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v23, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v22, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v2, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_5
 ; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT:    v_lshr_b64 v[16:17], v[2:3], v30
-; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
-; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
-; SDAG-NEXT:    v_lshr_b64 v[37:38], v[10:11], v30
-; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[16:17], v32
+; SDAG-NEXT:    v_sub_i32_e32 v37, vcc, 64, v32
+; SDAG-NEXT:    v_subrev_i32_e32 v48, vcc, 64, v32
+; SDAG-NEXT:    v_lshr_b64 v[24:25], v[2:3], v32
+; SDAG-NEXT:    v_add_i32_e32 v36, vcc, -1, v31
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v23, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
-; SDAG-NEXT:    v_lshl_b64 v[48:49], v[10:11], v35
-; SDAG-NEXT:    v_lshr_b64 v[10:11], v[10:11], v36
-; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v29, vcc
-; SDAG-NEXT:    v_or_b32_e32 v17, v17, v49
-; SDAG-NEXT:    v_or_b32_e32 v16, v16, v48
-; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v11, v17, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v10, v16, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v38, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v37, s[4:5]
-; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
-; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
-; SDAG-NEXT:    v_cndmask_b32_e32 v3, v17, v3, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_lshl_b64 v[38:39], v[2:3], v37
+; SDAG-NEXT:    v_lshr_b64 v[2:3], v[2:3], v48
+; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v30, vcc
+; SDAG-NEXT:    v_or_b32_e32 v11, v11, v39
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v38
+; SDAG-NEXT:    v_addc_u32_e32 v38, vcc, -1, v0, vcc
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v32
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v25, 0, v25, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v24, 0, v24, s[4:5]
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v1, vcc
+; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
+; SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v16, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
 ; SDAG-NEXT:  .LBB0_3: ; %udiv-do-while3
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v3
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[24:25], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 31, v3
 ; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v9
+; SDAG-NEXT:    v_lshrrev_b32_e32 v24, 31, v9
 ; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v21
+; SDAG-NEXT:    v_lshrrev_b32_e32 v25, 31, v21
 ; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v16
-; SDAG-NEXT:    v_or_b32_e32 v2, v2, v38
-; SDAG-NEXT:    v_or_b32_e32 v8, v8, v39
+; SDAG-NEXT:    v_or_b32_e32 v16, v16, v10
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v24
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v25
 ; SDAG-NEXT:    v_or_b32_e32 v9, v19, v9
-; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v34, v2
+; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v36, v2
 ; SDAG-NEXT:    v_or_b32_e32 v8, v18, v8
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v35, v3, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v36, v10, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v37, v11, vcc
-; SDAG-NEXT:    v_ashrrev_i32_e32 v38, 31, v16
-; SDAG-NEXT:    v_and_b32_e32 v39, v38, v28
-; SDAG-NEXT:    v_and_b32_e32 v48, v38, v29
-; SDAG-NEXT:    v_and_b32_e32 v49, v38, v0
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v38
-; SDAG-NEXT:    v_and_b32_e32 v38, v38, v1
-; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v39
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v37, v3, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v38, v16, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v39, v17, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v24, 31, v10
+; SDAG-NEXT:    v_and_b32_e32 v25, v24, v31
+; SDAG-NEXT:    v_and_b32_e32 v48, v24, v30
+; SDAG-NEXT:    v_and_b32_e32 v49, v24, v0
+; SDAG-NEXT:    v_and_b32_e32 v10, 1, v24
+; SDAG-NEXT:    v_and_b32_e32 v50, v24, v1
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v25
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v48, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v10, v49, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v38, vcc
-; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
-; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
-; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v24, vcc, v16, v49, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, v17, v50, vcc
+; SDAG-NEXT:    v_add_i32_e32 v32, vcc, -1, v32
 ; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT:    v_or_b32_e32 v38, v30, v32
-; SDAG-NEXT:    v_or_b32_e32 v39, v31, v33
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT:    v_addc_u32_e32 v34, vcc, -1, v34, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v35, vcc
+; SDAG-NEXT:    v_or_b32_e32 v16, v32, v34
+; SDAG-NEXT:    v_or_b32_e32 v17, v33, v35
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
 ; SDAG-NEXT:    v_or_b32_e32 v21, v23, v21
 ; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; SDAG-NEXT:    v_or_b32_e32 v20, v22, v20
-; SDAG-NEXT:    v_mov_b32_e32 v23, v17
-; SDAG-NEXT:    v_mov_b32_e32 v22, v16
+; SDAG-NEXT:    v_mov_b32_e32 v23, v11
+; SDAG-NEXT:    v_mov_b32_e32 v22, v10
 ; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:    s_cbranch_execnz .LBB0_3
 ; SDAG-NEXT:  ; %bb.4: ; %Flow13
@@ -196,68 +198,70 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_lshl_b64 v[2:3], v[20:21], 1
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v8
 ; SDAG-NEXT:    v_or_b32_e32 v20, v19, v1
-; SDAG-NEXT:    v_or_b32_e32 v21, v17, v3
-; SDAG-NEXT:    v_or_b32_e32 v17, v18, v0
-; SDAG-NEXT:    v_or_b32_e32 v16, v16, v2
+; SDAG-NEXT:    v_or_b32_e32 v22, v11, v3
+; SDAG-NEXT:    v_or_b32_e32 v21, v18, v0
+; SDAG-NEXT:    v_or_b32_e32 v23, v10, v2
 ; SDAG-NEXT:  .LBB0_6: ; %Flow16
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:    v_ashrrev_i32_e32 v18, 31, v7
-; SDAG-NEXT:    v_ashrrev_i32_e32 v19, 31, v15
+; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v7
+; SDAG-NEXT:    v_ashrrev_i32_e32 v17, 31, v15
+; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, 0, v4
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT:    v_mov_b32_e32 v22, v18
-; SDAG-NEXT:    v_mov_b32_e32 v23, v19
-; SDAG-NEXT:    v_xor_b32_e32 v0, v18, v7
-; SDAG-NEXT:    v_xor_b32_e32 v1, v18, v6
-; SDAG-NEXT:    v_xor_b32_e32 v3, v18, v5
-; SDAG-NEXT:    v_xor_b32_e32 v2, v18, v4
-; SDAG-NEXT:    v_xor_b32_e32 v6, v19, v15
-; SDAG-NEXT:    v_xor_b32_e32 v7, v19, v14
-; SDAG-NEXT:    v_xor_b32_e32 v8, v19, v13
-; SDAG-NEXT:    v_xor_b32_e32 v10, v19, v12
-; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v18
-; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v18, vcc
-; SDAG-NEXT:    v_ffbh_u32_e32 v5, v2
-; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v1, v18, vcc
-; SDAG-NEXT:    v_add_i32_e64 v1, s[4:5], 32, v5
-; SDAG-NEXT:    v_ffbh_u32_e32 v11, v3
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v0, v18, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v18, v16
+; SDAG-NEXT:    v_mov_b32_e32 v19, v17
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, 0, v5, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, 0, v6, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v7, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v6, v8, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v3
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v7, v0, s[4:5]
+; SDAG-NEXT:    v_sub_i32_e32 v7, vcc, 0, v12
 ; SDAG-NEXT:    v_or_b32_e32 v0, v2, v4
-; SDAG-NEXT:    v_ffbh_u32_e32 v12, v4
-; SDAG-NEXT:    v_min_u32_e32 v11, v1, v11
-; SDAG-NEXT:    v_sub_i32_e32 v28, vcc, v10, v19
+; SDAG-NEXT:    v_ffbh_u32_e32 v8, v4
+; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v1
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v13, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v1, v3, v5
-; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v12
-; SDAG-NEXT:    v_ffbh_u32_e32 v12, v5
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 64, v11
-; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v29, vcc, v8, v19, vcc
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; SDAG-NEXT:    v_ffbh_u32_e32 v1, v28
-; SDAG-NEXT:    v_min_u32_e32 v8, v10, v12
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, v13, 0, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v7, v19, vcc
-; SDAG-NEXT:    v_add_i32_e64 v7, s[8:9], 32, v1
-; SDAG-NEXT:    v_ffbh_u32_e32 v12, v29
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v11, v8, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v6, v19, vcc
-; SDAG-NEXT:    v_or_b32_e32 v6, v28, v0
-; SDAG-NEXT:    v_ffbh_u32_e32 v11, v0
-; SDAG-NEXT:    v_min_u32_e32 v12, v7, v12
-; SDAG-NEXT:    v_or_b32_e32 v7, v29, v1
-; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT:    v_ffbh_u32_e32 v13, v1
-; SDAG-NEXT:    v_add_i32_e32 v12, vcc, 64, v12
-; SDAG-NEXT:    v_addc_u32_e64 v14, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], 32, v8
+; SDAG-NEXT:    v_ffbh_u32_e32 v30, v5
+; SDAG-NEXT:    v_min_u32_e32 v6, v10, v6
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v14, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e64 v24, v13, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v25, v12, v7, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_min_u32_e32 v1, v8, v30
+; SDAG-NEXT:    v_add_i32_e64 v6, s[8:9], 64, v6
+; SDAG-NEXT:    v_addc_u32_e64 v7, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, 0, v15, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v14, v10, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v10, v25
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v24
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v7, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v13, v6, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v15, v8, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v6, v25, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v8, v0
+; SDAG-NEXT:    v_add_i32_e32 v10, vcc, 32, v10
+; SDAG-NEXT:    v_or_b32_e32 v7, v24, v1
+; SDAG-NEXT:    v_add_i32_e32 v8, vcc, 32, v8
+; SDAG-NEXT:    v_ffbh_u32_e32 v14, v1
+; SDAG-NEXT:    v_min_u32_e32 v10, v10, v11
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT:    v_min_u32_e32 v6, v11, v13
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, v14, 0, s[6:7]
-; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s[6:7]
-; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v10, vcc
+; SDAG-NEXT:    v_min_u32_e32 v6, v8, v14
+; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], 64, v10
+; SDAG-NEXT:    v_addc_u32_e64 v8, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v6, v13
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v8, v12, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v10, 0x7f, v6
 ; SDAG-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v9, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
@@ -272,7 +276,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; SDAG-NEXT:    v_and_b32_e32 v10, 1, v12
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v10
-; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v13, v5, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, v4, 0, s[4:5]
@@ -318,7 +322,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
 ; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
 ; SDAG-NEXT:    v_lshr_b64 v[37:38], v[4:5], v30
-; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v25
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
 ; SDAG-NEXT:    v_mov_b32_e32 v14, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v15, 0
@@ -326,7 +330,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SDAG-NEXT:    v_lshl_b64 v[48:49], v[4:5], v35
 ; SDAG-NEXT:    v_lshr_b64 v[4:5], v[4:5], v36
-; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v29, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v24, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v11, v11, v49
 ; SDAG-NEXT:    v_or_b32_e32 v10, v10, v48
 ; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
@@ -363,8 +367,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_and_b32_e32 v10, 1, v15
 ; SDAG-NEXT:    v_and_b32_e32 v38, v15, v1
 ; SDAG-NEXT:    v_and_b32_e32 v39, v15, v0
-; SDAG-NEXT:    v_and_b32_e32 v48, v15, v29
-; SDAG-NEXT:    v_and_b32_e32 v15, v15, v28
+; SDAG-NEXT:    v_and_b32_e32 v48, v15, v24
+; SDAG-NEXT:    v_and_b32_e32 v15, v15, v25
 ; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v15
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v48, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v39, vcc
@@ -396,14 +400,14 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v10, v10, v2
 ; SDAG-NEXT:  .LBB0_12: ; %Flow12
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:    v_xor_b32_e32 v3, v27, v26
-; SDAG-NEXT:    v_xor_b32_e32 v2, v25, v24
-; SDAG-NEXT:    v_xor_b32_e32 v7, v23, v22
-; SDAG-NEXT:    v_xor_b32_e32 v6, v19, v18
+; SDAG-NEXT:    v_xor_b32_e32 v3, v29, v28
+; SDAG-NEXT:    v_xor_b32_e32 v2, v27, v26
+; SDAG-NEXT:    v_xor_b32_e32 v7, v19, v18
+; SDAG-NEXT:    v_xor_b32_e32 v6, v17, v16
 ; SDAG-NEXT:    v_xor_b32_e32 v4, v20, v3
-; SDAG-NEXT:    v_xor_b32_e32 v5, v17, v2
-; SDAG-NEXT:    v_xor_b32_e32 v1, v21, v3
-; SDAG-NEXT:    v_xor_b32_e32 v0, v16, v2
+; SDAG-NEXT:    v_xor_b32_e32 v5, v21, v2
+; SDAG-NEXT:    v_xor_b32_e32 v1, v22, v3
+; SDAG-NEXT:    v_xor_b32_e32 v0, v23, v2
 ; SDAG-NEXT:    v_xor_b32_e32 v8, v13, v7
 ; SDAG-NEXT:    v_xor_b32_e32 v9, v11, v6
 ; SDAG-NEXT:    v_xor_b32_e32 v11, v14, v7
@@ -1553,118 +1557,119 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
-; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v11
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 0, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
 ; SDAG-NEXT:    v_mov_b32_e32 v29, v28
-; SDAG-NEXT:    v_xor_b32_e32 v18, v3, v28
-; SDAG-NEXT:    v_xor_b32_e32 v19, v2, v28
-; SDAG-NEXT:    v_xor_b32_e32 v1, v1, v28
-; SDAG-NEXT:    v_xor_b32_e32 v0, v0, v28
-; SDAG-NEXT:    v_xor_b32_e32 v11, v11, v16
-; SDAG-NEXT:    v_xor_b32_e32 v10, v10, v16
-; SDAG-NEXT:    v_xor_b32_e32 v20, v9, v16
-; SDAG-NEXT:    v_xor_b32_e32 v9, v8, v16
-; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v0, v28
-; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v1, v28, vcc
-; SDAG-NEXT:    v_ffbh_u32_e32 v1, v2
-; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v19, v28, vcc
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v1
-; SDAG-NEXT:    v_ffbh_u32_e32 v21, v3
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v18, v28, vcc
-; SDAG-NEXT:    v_or_b32_e32 v8, v2, v0
-; SDAG-NEXT:    v_ffbh_u32_e32 v18, v0
-; SDAG-NEXT:    v_min_u32_e32 v19, v19, v21
-; SDAG-NEXT:    v_sub_i32_e32 v31, vcc, v9, v16
-; SDAG-NEXT:    v_or_b32_e32 v9, v3, v1
+; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v1, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v0, v16, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v2, v18, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v16
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v17
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; SDAG-NEXT:    v_sub_i32_e32 v21, vcc, 0, v8
+; SDAG-NEXT:    v_or_b32_e32 v2, v16, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v0
 ; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 32, v18
-; SDAG-NEXT:    v_ffbh_u32_e32 v21, v1
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 64, v19
-; SDAG-NEXT:    v_addc_u32_e64 v22, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v30, vcc, v20, v16, vcc
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
-; SDAG-NEXT:    v_ffbh_u32_e32 v9, v31
-; SDAG-NEXT:    v_min_u32_e32 v18, v18, v21
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, v22, 0, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v10, v16, vcc
-; SDAG-NEXT:    v_add_i32_e64 v21, s[8:9], 32, v9
-; SDAG-NEXT:    v_ffbh_u32_e32 v22, v30
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v19, v18, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v11, v16, vcc
-; SDAG-NEXT:    v_or_b32_e32 v10, v31, v8
-; SDAG-NEXT:    v_ffbh_u32_e32 v16, v8
-; SDAG-NEXT:    v_min_u32_e32 v19, v21, v22
-; SDAG-NEXT:    v_or_b32_e32 v11, v30, v9
-; SDAG-NEXT:    v_add_i32_e32 v16, vcc, 32, v16
-; SDAG-NEXT:    v_ffbh_u32_e32 v21, v9
-; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
-; SDAG-NEXT:    v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_min_u32_e32 v10, v16, v21
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[8:9]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, v22, 0, s[6:7]
-; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, v19, v10, s[6:7]
-; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v10, v18
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v20, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v10
-; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
+; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v3, v17, v1
+; SDAG-NEXT:    v_add_i32_e64 v22, s[4:5], 32, v22
+; SDAG-NEXT:    v_ffbh_u32_e32 v24, v1
+; SDAG-NEXT:    v_min_u32_e32 v18, v18, v20
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v10, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v30, v9, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v31, v8, v21, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; SDAG-NEXT:    v_min_u32_e32 v3, v22, v24
+; SDAG-NEXT:    v_add_i32_e64 v8, s[8:9], 64, v18
+; SDAG-NEXT:    v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v11, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v10, v20, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v10, v31
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v30
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v9, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v22, v8, v3, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v11, v18, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v8, v31, v2
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v2
+; SDAG-NEXT:    v_add_i32_e32 v10, vcc, 32, v10
+; SDAG-NEXT:    v_or_b32_e32 v9, v30, v3
+; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
+; SDAG-NEXT:    v_ffbh_u32_e32 v18, v3
+; SDAG-NEXT:    v_min_u32_e32 v10, v10, v20
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_min_u32_e32 v8, v11, v18
+; SDAG-NEXT:    v_add_i32_e64 v9, s[4:5], 64, v10
+; SDAG-NEXT:    v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v8, v22
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, v10, v21, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v10, 0x7f, v8
+; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v17, vcc
-; SDAG-NEXT:    v_or_b32_e32 v16, v16, v18
+; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v18
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v17, v11, v19
+; SDAG-NEXT:    v_or_b32_e32 v11, v9, v19
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v20
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_and_b32_e32 v10, 1, v20
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v35, v1, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v32, v0, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v27, v3, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v33, v2, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v27, v17, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v33, v16, 0, s[4:5]
 ; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_6
 ; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
-; SDAG-NEXT:    v_add_i32_e32 v32, vcc, 1, v10
-; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v10
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
-; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT:    v_lshl_b64 v[20:21], v[2:3], v20
+; SDAG-NEXT:    v_add_i32_e32 v32, vcc, 1, v8
+; SDAG-NEXT:    v_sub_i32_e64 v20, s[4:5], 63, v8
+; SDAG-NEXT:    v_mov_b32_e32 v10, 0
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[16:17], v20
 ; SDAG-NEXT:    v_addc_u32_e32 v34, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, 0, v19, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v18, v32, v34
-; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v10
+; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, 0x7f, v8
 ; SDAG-NEXT:    v_or_b32_e32 v19, v33, v35
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[0:1], v24
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[0:1], v24
 ; SDAG-NEXT:    v_sub_i32_e32 v25, vcc, 64, v24
-; SDAG-NEXT:    v_lshl_b64 v[22:23], v[2:3], v24
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[16:17], v24
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT:    v_lshr_b64 v[18:19], v[2:3], v25
-; SDAG-NEXT:    v_or_b32_e32 v11, v11, v19
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v18
+; SDAG-NEXT:    v_lshr_b64 v[18:19], v[16:17], v25
+; SDAG-NEXT:    v_or_b32_e32 v9, v9, v19
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v18
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v24
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, v21, v11, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, v20, v10, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v21, v9, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v20, v8, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, v23, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, v22, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, v11, v1, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, v0, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_5
 ; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
-; SDAG-NEXT:    v_lshr_b64 v[16:17], v[2:3], v32
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[16:17], v32
 ; SDAG-NEXT:    v_sub_i32_e32 v26, vcc, 64, v32
 ; SDAG-NEXT:    v_subrev_i32_e32 v37, vcc, 64, v32
 ; SDAG-NEXT:    v_lshr_b64 v[24:25], v[0:1], v32
@@ -1677,43 +1682,43 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_lshl_b64 v[26:27], v[0:1], v26
 ; SDAG-NEXT:    v_lshr_b64 v[48:49], v[0:1], v37
 ; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v30, vcc
-; SDAG-NEXT:    v_or_b32_e32 v17, v17, v27
-; SDAG-NEXT:    v_or_b32_e32 v16, v16, v26
-; SDAG-NEXT:    v_addc_u32_e32 v38, vcc, -1, v8, vcc
+; SDAG-NEXT:    v_or_b32_e32 v11, v11, v27
+; SDAG-NEXT:    v_or_b32_e32 v10, v10, v26
+; SDAG-NEXT:    v_addc_u32_e32 v38, vcc, -1, v2, vcc
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v32
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v49, v17, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v48, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v49, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v48, v10, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v27, 0, v25, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v26, 0, v24, s[4:5]
-; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v9, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, -1, v3, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v32
-; SDAG-NEXT:    v_cndmask_b32_e32 v25, v17, v3, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v24, v16, v2, vcc
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_cndmask_b32_e32 v25, v11, v17, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v24, v10, v16, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v11, 0
 ; SDAG-NEXT:  .LBB2_3: ; %udiv-do-while3
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SDAG-NEXT:    v_lshl_b64 v[26:27], v[26:27], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v25
+; SDAG-NEXT:    v_lshrrev_b32_e32 v10, 31, v25
 ; SDAG-NEXT:    v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v48, 31, v11
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v48, 31, v9
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v49, 31, v21
 ; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT:    v_or_b32_e32 v26, v26, v16
+; SDAG-NEXT:    v_or_b32_e32 v26, v26, v10
 ; SDAG-NEXT:    v_or_b32_e32 v24, v24, v48
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v49
-; SDAG-NEXT:    v_or_b32_e32 v11, v19, v11
-; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v36, v24
-; SDAG-NEXT:    v_or_b32_e32 v10, v18, v10
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v37, v25, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v38, v26, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v39, v27, vcc
-; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
-; SDAG-NEXT:    v_and_b32_e32 v48, v16, v31
-; SDAG-NEXT:    v_and_b32_e32 v49, v16, v30
-; SDAG-NEXT:    v_and_b32_e32 v50, v16, v8
-; SDAG-NEXT:    v_and_b32_e32 v51, v16, v9
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v16
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v49
+; SDAG-NEXT:    v_or_b32_e32 v9, v19, v9
+; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v36, v24
+; SDAG-NEXT:    v_or_b32_e32 v8, v18, v8
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v37, v25, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v38, v26, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v39, v27, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v10, 31, v10
+; SDAG-NEXT:    v_and_b32_e32 v48, v10, v31
+; SDAG-NEXT:    v_and_b32_e32 v49, v10, v30
+; SDAG-NEXT:    v_and_b32_e32 v50, v10, v2
+; SDAG-NEXT:    v_and_b32_e32 v51, v10, v3
+; SDAG-NEXT:    v_and_b32_e32 v10, 1, v10
 ; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, v24, v48
 ; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, v25, v49, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v26, vcc, v26, v50, vcc
@@ -1728,137 +1733,138 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v21, v23, v21
 ; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
 ; SDAG-NEXT:    v_or_b32_e32 v20, v22, v20
-; SDAG-NEXT:    v_mov_b32_e32 v23, v17
-; SDAG-NEXT:    v_mov_b32_e32 v22, v16
+; SDAG-NEXT:    v_mov_b32_e32 v23, v11
+; SDAG-NEXT:    v_mov_b32_e32 v22, v10
 ; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:    s_cbranch_execnz .LBB2_3
 ; SDAG-NEXT:  ; %bb.4: ; %Flow13
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:  .LBB2_5: ; %Flow14
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v22, 31, v21
 ; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
-; SDAG-NEXT:    v_or_b32_e32 v10, v10, v22
-; SDAG-NEXT:    v_or_b32_e32 v35, v19, v11
-; SDAG-NEXT:    v_or_b32_e32 v27, v17, v21
-; SDAG-NEXT:    v_or_b32_e32 v32, v18, v10
-; SDAG-NEXT:    v_or_b32_e32 v33, v16, v20
+; SDAG-NEXT:    v_or_b32_e32 v8, v8, v22
+; SDAG-NEXT:    v_or_b32_e32 v35, v19, v9
+; SDAG-NEXT:    v_or_b32_e32 v27, v11, v21
+; SDAG-NEXT:    v_or_b32_e32 v32, v18, v8
+; SDAG-NEXT:    v_or_b32_e32 v33, v10, v20
 ; SDAG-NEXT:  .LBB2_6: ; %Flow16
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v26, 31, v7
-; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
 ; SDAG-NEXT:    v_mov_b32_e32 v34, v26
-; SDAG-NEXT:    v_xor_b32_e32 v10, v7, v26
-; SDAG-NEXT:    v_xor_b32_e32 v11, v6, v26
-; SDAG-NEXT:    v_xor_b32_e32 v5, v5, v26
-; SDAG-NEXT:    v_xor_b32_e32 v4, v4, v26
-; SDAG-NEXT:    v_xor_b32_e32 v15, v15, v16
-; SDAG-NEXT:    v_xor_b32_e32 v14, v14, v16
-; SDAG-NEXT:    v_xor_b32_e32 v13, v13, v16
-; SDAG-NEXT:    v_xor_b32_e32 v12, v12, v16
-; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v4, v26
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v5, v26, vcc
-; SDAG-NEXT:    v_ffbh_u32_e32 v5, v6
-; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v11, v26, vcc
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 32, v5
-; SDAG-NEXT:    v_ffbh_u32_e32 v18, v7
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v10, v26, vcc
-; SDAG-NEXT:    v_or_b32_e32 v10, v6, v4
-; SDAG-NEXT:    v_ffbh_u32_e32 v19, v4
-; SDAG-NEXT:    v_min_u32_e32 v18, v11, v18
-; SDAG-NEXT:    v_sub_i32_e32 v37, vcc, v12, v16
-; SDAG-NEXT:    v_or_b32_e32 v11, v7, v5
-; SDAG-NEXT:    v_add_i32_e64 v12, s[4:5], 32, v19
-; SDAG-NEXT:    v_ffbh_u32_e32 v19, v5
-; SDAG-NEXT:    v_add_i32_e64 v18, s[4:5], 64, v18
-; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v36, vcc, v13, v16, vcc
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v5, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v6, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, v5, v9, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, v4, v8, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, 0, v7, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v10, v8
+; SDAG-NEXT:    v_ffbh_u32_e32 v11, v9
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
+; SDAG-NEXT:    v_sub_i32_e32 v19, vcc, 0, v12
+; SDAG-NEXT:    v_or_b32_e32 v6, v8, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v4
+; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v10
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v13, vcc
+; SDAG-NEXT:    v_or_b32_e32 v7, v9, v5
+; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], 32, v20
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v5
+; SDAG-NEXT:    v_min_u32_e32 v10, v10, v11
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v14, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
+; SDAG-NEXT:    v_cndmask_b32_e64 v36, v13, v21, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v37, v12, v19, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
+; SDAG-NEXT:    v_min_u32_e32 v7, v20, v22
+; SDAG-NEXT:    v_add_i32_e64 v10, s[8:9], 64, v10
+; SDAG-NEXT:    v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_subb_u32_e32 v13, vcc, 0, v15, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v14, v11, s[4:5]
 ; SDAG-NEXT:    v_ffbh_u32_e32 v11, v37
-; SDAG-NEXT:    v_min_u32_e32 v12, v12, v19
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, v20, 0, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v14, v16, vcc
-; SDAG-NEXT:    v_add_i32_e64 v13, s[8:9], 32, v11
 ; SDAG-NEXT:    v_ffbh_u32_e32 v14, v36
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v12, s[6:7]
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v15, v16, vcc
-; SDAG-NEXT:    v_or_b32_e32 v12, v37, v10
-; SDAG-NEXT:    v_ffbh_u32_e32 v15, v10
-; SDAG-NEXT:    v_min_u32_e32 v14, v13, v14
-; SDAG-NEXT:    v_or_b32_e32 v13, v36, v11
-; SDAG-NEXT:    v_add_i32_e32 v15, vcc, 32, v15
-; SDAG-NEXT:    v_ffbh_u32_e32 v16, v11
-; SDAG-NEXT:    v_add_i32_e32 v14, vcc, 64, v14
-; SDAG-NEXT:    v_addc_u32_e64 v20, s[6:7], 0, 0, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v19, v10, v7, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v15, v13, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v10, v37, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v13, v6
+; SDAG-NEXT:    v_add_i32_e32 v15, vcc, 32, v11
+; SDAG-NEXT:    v_or_b32_e32 v11, v36, v7
+; SDAG-NEXT:    v_add_i32_e32 v13, vcc, 32, v13
+; SDAG-NEXT:    v_ffbh_u32_e32 v20, v7
+; SDAG-NEXT:    v_min_u32_e32 v14, v15, v14
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_min_u32_e32 v10, v13, v20
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 64, v14
+; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v13, v12, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v14, 0x7f, v10
+; SDAG-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; SDAG-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; SDAG-NEXT:    v_or_b32_e32 v15, v11, v13
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT:    v_min_u32_e32 v12, v15, v16
-; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, v20, 0, s[6:7]
-; SDAG-NEXT:    s_or_b64 s[8:9], vcc, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, v14, v12, s[6:7]
-; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, v12, v18
-; SDAG-NEXT:    v_subb_u32_e32 v13, vcc, v13, v19, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v12
-; SDAG-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v17, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v17, vcc
-; SDAG-NEXT:    v_or_b32_e32 v16, v16, v14
+; SDAG-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v17, v13, v15
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v18
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_and_b32_e32 v14, 1, v18
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v14
+; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v19, v5, 0, s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, v4, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v7, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v6, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v9, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v8, 0, s[4:5]
 ; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_12
 ; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT:    v_add_i32_e32 v38, vcc, 1, v12
-; SDAG-NEXT:    v_sub_i32_e64 v18, s[4:5], 63, v12
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
-; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, 0, v13, vcc
-; SDAG-NEXT:    v_lshl_b64 v[18:19], v[6:7], v18
-; SDAG-NEXT:    v_addc_u32_e32 v48, vcc, 0, v14, vcc
-; SDAG-NEXT:    v_addc_u32_e32 v49, vcc, 0, v15, vcc
-; SDAG-NEXT:    v_or_b32_e32 v13, v38, v48
-; SDAG-NEXT:    v_sub_i32_e32 v15, vcc, 0x7f, v12
-; SDAG-NEXT:    v_or_b32_e32 v14, v39, v49
-; SDAG-NEXT:    v_lshl_b64 v[20:21], v[4:5], v15
-; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, 64, v15
-; SDAG-NEXT:    v_lshl_b64 v[22:23], v[6:7], v15
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[13:14]
-; SDAG-NEXT:    v_lshr_b64 v[12:13], v[6:7], v12
-; SDAG-NEXT:    v_or_b32_e32 v13, v21, v13
-; SDAG-NEXT:    v_or_b32_e32 v12, v20, v12
-; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v15
-; SDAG-NEXT:    v_cndmask_b32_e64 v14, v19, v13, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v12, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, v23, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, v22, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
-; SDAG-NEXT:    v_cndmask_b32_e64 v15, v14, v5, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v14, v18, v4, s[4:5]
+; SDAG-NEXT:    v_add_i32_e32 v38, vcc, 1, v10
+; SDAG-NEXT:    v_sub_i32_e64 v18, s[4:5], 63, v10
+; SDAG-NEXT:    v_mov_b32_e32 v14, 0
+; SDAG-NEXT:    v_mov_b32_e32 v15, 0
+; SDAG-NEXT:    v_addc_u32_e32 v39, vcc, 0, v11, vcc
+; SDAG-NEXT:    v_lshl_b64 v[18:19], v[8:9], v18
+; SDAG-NEXT:    v_addc_u32_e32 v48, vcc, 0, v12, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v49, vcc, 0, v13, vcc
+; SDAG-NEXT:    v_or_b32_e32 v11, v38, v48
+; SDAG-NEXT:    v_sub_i32_e32 v13, vcc, 0x7f, v10
+; SDAG-NEXT:    v_or_b32_e32 v12, v39, v49
+; SDAG-NEXT:    v_lshl_b64 v[20:21], v[4:5], v13
+; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, 64, v13
+; SDAG-NEXT:    v_lshl_b64 v[22:23], v[8:9], v13
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[11:12]
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[8:9], v10
+; SDAG-NEXT:    v_or_b32_e32 v11, v21, v11
+; SDAG-NEXT:    v_or_b32_e32 v10, v20, v10
+; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v13
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v19, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v18, v10, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v22, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v12, v5, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v18, v4, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_11
 ; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT:    v_lshr_b64 v[16:17], v[6:7], v38
+; SDAG-NEXT:    v_lshr_b64 v[14:15], v[8:9], v38
 ; SDAG-NEXT:    v_sub_i32_e32 v24, vcc, 64, v38
 ; SDAG-NEXT:    v_subrev_i32_e32 v51, vcc, 64, v38
 ; SDAG-NEXT:    v_lshr_b64 v[22:23], v[4:5], v38
@@ -1871,42 +1877,42 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_lshl_b64 v[24:25], v[4:5], v24
 ; SDAG-NEXT:    v_lshr_b64 v[53:54], v[4:5], v51
 ; SDAG-NEXT:    v_addc_u32_e32 v51, vcc, -1, v36, vcc
-; SDAG-NEXT:    v_or_b32_e32 v17, v17, v25
-; SDAG-NEXT:    v_or_b32_e32 v16, v16, v24
-; SDAG-NEXT:    v_addc_u32_e32 v52, vcc, -1, v10, vcc
+; SDAG-NEXT:    v_or_b32_e32 v15, v15, v25
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v24
+; SDAG-NEXT:    v_addc_u32_e32 v52, vcc, -1, v6, vcc
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v38
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v54, v17, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v53, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v15, v54, v15, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v53, v14, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v25, 0, v23, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v24, 0, v22, s[4:5]
-; SDAG-NEXT:    v_addc_u32_e32 v53, vcc, -1, v11, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v53, vcc, -1, v7, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v38
-; SDAG-NEXT:    v_cndmask_b32_e32 v23, v17, v7, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v22, v16, v6, vcc
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_cndmask_b32_e32 v23, v15, v9, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v22, v14, v8, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v15, 0
 ; SDAG-NEXT:  .LBB2_9: ; %udiv-do-while
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SDAG-NEXT:    v_lshl_b64 v[24:25], v[24:25], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v23
+; SDAG-NEXT:    v_lshrrev_b32_e32 v14, 31, v23
 ; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v54, 31, v15
-; SDAG-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v55, 31, v13
+; SDAG-NEXT:    v_lshrrev_b32_e32 v54, 31, v13
 ; SDAG-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
-; SDAG-NEXT:    v_or_b32_e32 v24, v24, v16
+; SDAG-NEXT:    v_lshrrev_b32_e32 v55, 31, v11
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_or_b32_e32 v24, v24, v14
 ; SDAG-NEXT:    v_or_b32_e32 v22, v22, v54
-; SDAG-NEXT:    v_or_b32_e32 v14, v14, v55
-; SDAG-NEXT:    v_or_b32_e32 v15, v19, v15
-; SDAG-NEXT:    v_or_b32_e32 v13, v21, v13
-; SDAG-NEXT:    v_or_b32_e32 v14, v18, v14
-; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v50, v22
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v51, v23, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v52, v24, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v53, v25, vcc
-; SDAG-NEXT:    v_ashrrev_i32_e32 v21, 31, v16
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v21
-; SDAG-NEXT:    v_and_b32_e32 v54, v21, v11
-; SDAG-NEXT:    v_and_b32_e32 v55, v21, v10
+; SDAG-NEXT:    v_or_b32_e32 v12, v12, v55
+; SDAG-NEXT:    v_or_b32_e32 v13, v19, v13
+; SDAG-NEXT:    v_or_b32_e32 v11, v21, v11
+; SDAG-NEXT:    v_or_b32_e32 v12, v18, v12
+; SDAG-NEXT:    v_sub_i32_e32 v14, vcc, v50, v22
+; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, v51, v23, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, v52, v24, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v14, vcc, v53, v25, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v21, 31, v14
+; SDAG-NEXT:    v_and_b32_e32 v14, 1, v21
+; SDAG-NEXT:    v_and_b32_e32 v54, v21, v7
+; SDAG-NEXT:    v_and_b32_e32 v55, v21, v6
 ; SDAG-NEXT:    v_and_b32_e32 v40, v21, v36
 ; SDAG-NEXT:    v_and_b32_e32 v21, v21, v37
 ; SDAG-NEXT:    v_sub_i32_e32 v22, vcc, v22, v21
@@ -1921,89 +1927,87 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v54, v38, v48
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[54:55]
 ; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT:    v_or_b32_e32 v12, v20, v12
-; SDAG-NEXT:    v_mov_b32_e32 v21, v17
-; SDAG-NEXT:    v_mov_b32_e32 v20, v16
+; SDAG-NEXT:    v_or_b32_e32 v10, v20, v10
+; SDAG-NEXT:    v_mov_b32_e32 v21, v15
+; SDAG-NEXT:    v_mov_b32_e32 v20, v14
 ; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:    s_cbranch_execnz .LBB2_9
 ; SDAG-NEXT:  ; %bb.10: ; %Flow
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:  .LBB2_11: ; %Flow11
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v13
 ; SDAG-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
-; SDAG-NEXT:    v_or_b32_e32 v14, v14, v20
-; SDAG-NEXT:    v_or_b32_e32 v19, v19, v15
-; SDAG-NEXT:    v_or_b32_e32 v17, v17, v13
-; SDAG-NEXT:    v_or_b32_e32 v18, v18, v14
-; SDAG-NEXT:    v_or_b32_e32 v16, v16, v12
+; SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v11
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT:    v_or_b32_e32 v12, v12, v20
+; SDAG-NEXT:    v_or_b32_e32 v19, v19, v13
+; SDAG-NEXT:    v_or_b32_e32 v15, v15, v11
+; SDAG-NEXT:    v_or_b32_e32 v18, v18, v12
+; SDAG-NEXT:    v_or_b32_e32 v14, v14, v10
 ; SDAG-NEXT:  .LBB2_12: ; %Flow12
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:    v_mul_lo_u32 v14, v33, v9
-; SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v33, v8, 0
-; SDAG-NEXT:    v_mul_lo_u32 v24, v27, v8
-; SDAG-NEXT:    v_mul_lo_u32 v25, v35, v31
-; SDAG-NEXT:    v_mul_lo_u32 v35, v32, v30
-; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v31, v33, 0
-; SDAG-NEXT:    v_mov_b32_e32 v15, 0
-; SDAG-NEXT:    v_mul_lo_u32 v38, v16, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v16, v10, 0
-; SDAG-NEXT:    v_mul_lo_u32 v39, v17, v10
+; SDAG-NEXT:    v_mul_lo_u32 v12, v33, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v33, v2, 0
+; SDAG-NEXT:    v_mul_lo_u32 v24, v27, v2
+; SDAG-NEXT:    v_mul_lo_u32 v35, v35, v31
+; SDAG-NEXT:    v_mul_lo_u32 v38, v32, v30
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v33, 0
+; SDAG-NEXT:    v_mov_b32_e32 v13, 0
+; SDAG-NEXT:    v_mul_lo_u32 v25, v14, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[20:21], s[4:5], v14, v6, 0
+; SDAG-NEXT:    v_mul_lo_u32 v39, v15, v6
 ; SDAG-NEXT:    v_mul_lo_u32 v19, v19, v37
 ; SDAG-NEXT:    v_mul_lo_u32 v48, v18, v36
-; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v37, v16, 0
-; SDAG-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; SDAG-NEXT:    v_mov_b32_e32 v14, v9
-; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15]
-; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; SDAG-NEXT:    v_add_i32_e64 v14, s[4:5], v21, v38
-; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v24
+; SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v37, v14, 0
+; SDAG-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; SDAG-NEXT:    v_mov_b32_e32 v12, v3
+; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[12:13]
+; SDAG-NEXT:    v_sub_i32_e32 v12, vcc, v16, v2
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], v21, v25
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v24
 ; SDAG-NEXT:    v_mov_b32_e32 v24, v23
-; SDAG-NEXT:    v_mov_b32_e32 v23, v15
-; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v31, v27, v[22:23]
-; SDAG-NEXT:    v_xor_b32_e32 v33, v2, v28
-; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v14, v39
-; SDAG-NEXT:    v_mov_b32_e32 v14, v11
-; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v36, v16, v[14:15]
-; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v32, v31, v[12:13]
-; SDAG-NEXT:    v_mov_b32_e32 v2, v9
-; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], v24, v2
-; SDAG-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v2, v8
-; SDAG-NEXT:    v_subb_u32_e32 v16, vcc, v3, v2, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v23, v13
+; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v31, v27, v[22:23]
+; SDAG-NEXT:    v_xor_b32_e32 v33, v12, v28
+; SDAG-NEXT:    v_add_i32_e64 v21, s[4:5], v16, v39
+; SDAG-NEXT:    v_mov_b32_e32 v12, v7
+; SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[4:5], v36, v14, v[12:13]
+; SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v32, v31, v[10:11]
+; SDAG-NEXT:    v_add_i32_e64 v24, s[4:5], v24, v3
+; SDAG-NEXT:    v_addc_u32_e64 v25, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v17, v2, vcc
 ; SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21]
-; SDAG-NEXT:    v_mov_b32_e32 v18, v23
-; SDAG-NEXT:    v_mov_b32_e32 v23, v15
-; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v37, v17, v[22:23]
-; SDAG-NEXT:    v_add_i32_e64 v20, s[4:5], v25, v12
-; SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v30, v27, v[13:14]
-; SDAG-NEXT:    v_xor_b32_e32 v16, v16, v29
+; SDAG-NEXT:    v_mov_b32_e32 v14, v23
+; SDAG-NEXT:    v_mov_b32_e32 v23, v13
+; SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v37, v15, v[22:23]
+; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v35, v11
+; SDAG-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v30, v27, v[24:25]
+; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v29
 ; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v19, v3
-; SDAG-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v9
-; SDAG-NEXT:    v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v18, v8
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v35, v20
+; SDAG-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; SDAG-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_mov_b32_e32 v18, v12
+; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], v38, v11
 ; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], v48, v3
-; SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v36, v17, v[14:15]
-; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; SDAG-NEXT:    v_addc_u32_e64 v12, s[4:5], v13, v19, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v0, v11, vcc
-; SDAG-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v2
-; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v3, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; SDAG-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v36, v15, v[13:14]
+; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], v16, v10
+; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], v17, v19, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, v0, v10, vcc
+; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v2
+; SDAG-NEXT:    v_addc_u32_e64 v11, s[4:5], v12, v3, s[4:5]
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v13, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v2, v0, v28
 ; SDAG-NEXT:    v_xor_b32_e32 v3, v1, v29
 ; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v33, v28
-; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v16, v29, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v7, v29, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v2, v28, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v29, vcc
-; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v6, v10
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v18, vcc
+; SDAG-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v9, v18, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v6, v6, v26
-; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v8, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v4, vcc, v4, v10, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v34
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v9, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v11, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v8, v4, v26
 ; SDAG-NEXT:    v_xor_b32_e32 v9, v5, v34
 ; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v6, v26
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 687188ed5ca39..d4ff845e1edf3 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -331,15 +331,14 @@ bb3:                                              ; preds = %bb3, %bb
 define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
 ; GFX9-LABEL: sdiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s2
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_abs_i32 s2, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT:    s_sub_i32 s5, 0, s2
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -350,25 +349,25 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:  .LBB2_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s5
-; GFX9-NEXT:    s_mul_i32 s7, s6, s3
-; GFX9-NEXT:    s_sub_i32 s7, s4, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s3, s5
+; GFX9-NEXT:    s_mul_i32 s7, s6, s2
+; GFX9-NEXT:    s_sub_i32 s7, s3, s7
 ; GFX9-NEXT:    s_add_i32 s8, s6, 1
-; GFX9-NEXT:    s_sub_i32 s9, s7, s3
-; GFX9-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX9-NEXT:    s_sub_i32 s9, s7, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s2
 ; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
 ; GFX9-NEXT:    s_add_i32 s8, s6, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s2
 ; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
-; GFX9-NEXT:    s_xor_b32 s6, s6, s2
-; GFX9-NEXT:    s_sub_i32 s6, s6, s2
-; GFX9-NEXT:    s_add_i32 s4, s4, 1
+; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    s_sub_i32 s6, s6, s4
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x400
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -377,12 +376,11 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x2c
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX10-NEXT:    s_abs_i32 s2, s3
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    s_add_i32 s3, s3, s2
-; GFX10-NEXT:    s_xor_b32 s3, s3, s2
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX10-NEXT:    s_sub_i32 s4, 0, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX10-NEXT:    s_sub_i32 s4, 0, s2
+; GFX10-NEXT:    s_ashr_i32 s3, s3, 31
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -395,19 +393,19 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:  .LBB2_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_mul_hi_u32 s6, s4, s5
-; GFX10-NEXT:    s_mul_i32 s7, s6, s3
+; GFX10-NEXT:    s_mul_i32 s7, s6, s2
 ; GFX10-NEXT:    s_add_i32 s8, s6, 1
 ; GFX10-NEXT:    s_sub_i32 s7, s4, s7
-; GFX10-NEXT:    s_sub_i32 s9, s7, s3
-; GFX10-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX10-NEXT:    s_sub_i32 s9, s7, s2
+; GFX10-NEXT:    s_cmp_ge_u32 s7, s2
 ; GFX10-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX10-NEXT:    s_cselect_b32 s7, s9, s7
 ; GFX10-NEXT:    s_add_i32 s8, s6, 1
-; GFX10-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX10-NEXT:    s_cmp_ge_u32 s7, s2
 ; GFX10-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX10-NEXT:    s_add_i32 s4, s4, 1
-; GFX10-NEXT:    s_xor_b32 s6, s6, s2
-; GFX10-NEXT:    s_sub_i32 s6, s6, s2
+; GFX10-NEXT:    s_xor_b32 s6, s6, s3
+; GFX10-NEXT:    s_sub_i32 s6, s6, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
@@ -425,22 +423,20 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_ashr_i32 s2, s3, 31
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s3, s3, s2
-; GFX11-NEXT:    s_xor_b32 s3, s3, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX11-NEXT:    s_sub_i32 s4, 0, s3
+; GFX11-NEXT:    s_abs_i32 s2, s3
+; GFX11-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT:    s_sub_i32 s4, 0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_i32 s4, s4, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_hi_u32 s6, s5, s4
 ; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_add_i32 s5, s5, s6
@@ -449,21 +445,21 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_hi_u32 s6, s4, s5
-; GFX11-NEXT:    s_mul_i32 s7, s6, s3
+; GFX11-NEXT:    s_mul_i32 s7, s6, s2
 ; GFX11-NEXT:    s_add_i32 s8, s6, 1
 ; GFX11-NEXT:    s_sub_i32 s7, s4, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_sub_i32 s9, s7, s3
-; GFX11-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX11-NEXT:    s_sub_i32 s9, s7, s2
+; GFX11-NEXT:    s_cmp_ge_u32 s7, s2
 ; GFX11-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX11-NEXT:    s_cselect_b32 s7, s9, s7
 ; GFX11-NEXT:    s_add_i32 s8, s6, 1
-; GFX11-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX11-NEXT:    s_cmp_ge_u32 s7, s2
 ; GFX11-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX11-NEXT:    s_add_i32 s4, s4, 1
-; GFX11-NEXT:    s_xor_b32 s6, s6, s2
+; GFX11-NEXT:    s_xor_b32 s6, s6, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_sub_i32 s6, s6, s2
+; GFX11-NEXT:    s_sub_i32 s6, s6, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, 4
@@ -495,14 +491,12 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: srem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b32 s3, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_abs_i32 s2, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s2
-; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -524,7 +518,6 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX9-NEXT:    s_add_i32 s3, s3, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -537,10 +530,8 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX10-NEXT:    s_abs_i32 s2, s2
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT:    s_add_i32 s2, s2, s3
-; GFX10-NEXT:    s_xor_b32 s2, s2, s3
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX10-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -581,10 +572,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s2, s2, s3
-; GFX11-NEXT:    s_xor_b32 s2, s2, s3
+; GFX11-NEXT:    s_abs_i32 s2, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX11-NEXT:    s_sub_i32 s3, 0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
index f950717c591a9..3e6de32492457 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
@@ -17,15 +17,16 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB0_14
 ; GCN-NEXT:  ; %bb.1: ; %itofp-if-end
-; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
-; GCN-NEXT:    v_xor_b32_e32 v0, v5, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, v5, v1
-; GCN-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
-; GCN-NEXT:    v_xor_b32_e32 v2, v5, v2
-; GCN-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v6, v5, v3
-; GCN-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
-; GCN-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; GCN-NEXT:    v_sub_co_u32_e32 v4, vcc, 0, v0
+; GCN-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-NEXT:    v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
+; GCN-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GCN-NEXT:    ; implicit-def: $vgpr8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; GCN-NEXT:    v_ffbh_u32_e32 v2, v4
 ; GCN-NEXT:    v_add_u32_e32 v2, 32, v2
 ; GCN-NEXT:    v_ffbh_u32_e32 v6, v5
@@ -40,7 +41,6 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) {
 ; GCN-NEXT:    v_sub_u32_e32 v6, 0x80, v7
 ; GCN-NEXT:    v_sub_u32_e32 v2, 0x7f, v7
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v6
-; GCN-NEXT:    ; implicit-def: $vgpr8
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:  ; %bb.2: ; %itofp-if-else
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index c6aa2182aec80..c5198cdb421a5 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -13,15 +13,16 @@ define float @sitofp_i128_to_f32(i128 %x) {
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_14
 ; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
-; SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
-; SDAG-NEXT:    v_xor_b32_e32 v0, v5, v0
-; SDAG-NEXT:    v_xor_b32_e32 v1, v5, v1
-; SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
-; SDAG-NEXT:    v_xor_b32_e32 v2, v5, v2
-; SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v6, v5, v3
-; SDAG-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
-; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, 0, v0
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SDAG-NEXT:    v_ffbh_u32_e32 v2, v4
 ; SDAG-NEXT:    v_add_u32_e32 v2, 32, v2
 ; SDAG-NEXT:    v_ffbh_u32_e32 v6, v5
@@ -36,7 +37,6 @@ define float @sitofp_i128_to_f32(i128 %x) {
 ; SDAG-NEXT:    v_sub_u32_e32 v6, 0x80, v7
 ; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v7
 ; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v6
-; SDAG-NEXT:    ; implicit-def: $vgpr8
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
@@ -524,16 +524,17 @@ define double @sitofp_i128_to_f64(i128 %x) {
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_14
 ; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
-; SDAG-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; SDAG-NEXT:    v_xor_b32_e32 v4, v0, v4
-; SDAG-NEXT:    v_xor_b32_e32 v5, v0, v5
-; SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v0
-; SDAG-NEXT:    v_xor_b32_e32 v2, v0, v2
-; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v0, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v1, v0, v3
-; SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, v2, v0, vcc
-; SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v0, vcc
+; SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, 0, v4
+; SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v5, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr10
+; SDAG-NEXT:    v_cndmask_b32_e32 v6, v2, v6, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v7, v3, v7, vcc
 ; SDAG-NEXT:    v_ffbh_u32_e32 v0, v6
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
 ; SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
 ; SDAG-NEXT:    v_ffbh_u32_e32 v1, v7
 ; SDAG-NEXT:    v_min_u32_e32 v0, v0, v1
@@ -547,7 +548,6 @@ define double @sitofp_i128_to_f64(i128 %x) {
 ; SDAG-NEXT:    v_sub_u32_e32 v8, 0x80, v9
 ; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v9
 ; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 54, v8
-; SDAG-NEXT:    ; implicit-def: $vgpr10
 ; SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
@@ -1103,15 +1103,16 @@ define half @sitofp_i128_to_f16(i128 %x) {
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB4_14
 ; SDAG-NEXT:  ; %bb.1: ; %itofp-if-end
-; SDAG-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
-; SDAG-NEXT:    v_xor_b32_e32 v0, v5, v0
-; SDAG-NEXT:    v_xor_b32_e32 v1, v5, v1
-; SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
-; SDAG-NEXT:    v_xor_b32_e32 v2, v5, v2
-; SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v6, v5, v3
-; SDAG-NEXT:    v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
-; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, 0, v0
+; SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
+; SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    ; implicit-def: $vgpr8
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SDAG-NEXT:    v_ffbh_u32_e32 v2, v4
 ; SDAG-NEXT:    v_add_u32_e32 v2, 32, v2
 ; SDAG-NEXT:    v_ffbh_u32_e32 v6, v5
@@ -1126,7 +1127,6 @@ define half @sitofp_i128_to_f16(i128 %x) {
 ; SDAG-NEXT:    v_sub_u32_e32 v6, 0x80, v7
 ; SDAG-NEXT:    v_sub_u32_e32 v2, 0x7f, v7
 ; SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 25, v6
-; SDAG-NEXT:    ; implicit-def: $vgpr8
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SDAG-NEXT:  ; %bb.2: ; %itofp-if-else
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index b068d87c4d6f4..6b036f675929e 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -10,29 +10,31 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_srem_i128_vv:
 ; GFX9:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, 0, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v20, 31, v3
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v20
-; GFX9-NEXT:    v_xor_b32_e32 v10, v2, v20
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v20
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v20
-; GFX9-NEXT:    v_xor_b32_e32 v9, v3, v20
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v20, vcc
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v10, v20, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v8
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v9, v20, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v8
-; GFX9-NEXT:    v_sub_co_u32_e32 v23, vcc, v4, v8
-; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v8
-; GFX9-NEXT:    v_subb_co_u32_e32 v21, vcc, v5, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v8
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v8, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v7, v8, vcc
-; GFX9-NEXT:    v_or_b32_e32 v7, v21, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, 0, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v21, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v5, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v4, v8, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v10, vcc
+; GFX9-NEXT:    v_or_b32_e32 v7, v22, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v23, v4
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT:    v_or_b32_e32 v7, v3, v1
-; GFX9-NEXT:    v_or_b32_e32 v6, v2, v0
+; GFX9-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX9-NEXT:    v_or_b32_e32 v6, v0, v2
 ; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ffbh_u32_e32 v6, v4
 ; GFX9-NEXT:    v_add_u32_e32 v6, 32, v6
@@ -40,38 +42,37 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
 ; GFX9-NEXT:    v_ffbh_u32_e32 v7, v23
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT:    v_ffbh_u32_e32 v8, v21
+; GFX9-NEXT:    v_ffbh_u32_e32 v8, v22
 ; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 64, v7
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT:    v_ffbh_u32_e32 v9, v1
+; GFX9-NEXT:    v_ffbh_u32_e32 v10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-NEXT:    v_ffbh_u32_e32 v7, v0
+; GFX9-NEXT:    v_ffbh_u32_e32 v7, v2
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT:    v_min_u32_e32 v7, v7, v9
-; GFX9-NEXT:    v_ffbh_u32_e32 v9, v2
-; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT:    v_ffbh_u32_e32 v10, v3
-; GFX9-NEXT:    v_min_u32_e32 v9, v9, v10
+; GFX9-NEXT:    v_min_u32_e32 v7, v7, v10
+; GFX9-NEXT:    v_ffbh_u32_e32 v10, v0
+; GFX9-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX9-NEXT:    v_ffbh_u32_e32 v11, v1
+; GFX9-NEXT:    v_min_u32_e32 v10, v10, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, 64, v9
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, 64, v10
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[6:7], 0, 0, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, 0, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
+; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v11, vcc
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v8, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v13, v7, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT:    v_mov_b32_e32 v22, v20
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
@@ -82,10 +83,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v1, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v0, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v3, 0, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, v2, 0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v3, 0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v2, 0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v1, 0, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v0, 0, s[4:5]
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_6
@@ -98,22 +99,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_sub_u32_e32 v11, 64, v13
 ; GFX9-NEXT:    v_or_b32_e32 v8, v25, v27
 ; GFX9-NEXT:    v_or_b32_e32 v7, v24, v26
-; GFX9-NEXT:    v_lshlrev_b64 v[9:10], v13, v[0:1]
-; GFX9-NEXT:    v_lshrrev_b64 v[11:12], v11, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[9:10], v13, v[2:3]
+; GFX9-NEXT:    v_lshrrev_b64 v[11:12], v11, v[0:1]
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 63, v6
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
-; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[6:7], v6, v[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v8, v10, v12
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v13
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v13
-; GFX9-NEXT:    v_lshlrev_b64 v[12:13], v13, v[2:3]
+; GFX9-NEXT:    v_lshlrev_b64 v[12:13], v13, v[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v1, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, v13, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v11, 0
@@ -123,23 +124,23 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX9-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 64, v24
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v24, v[2:3]
-; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
+; GFX9-NEXT:    v_lshlrev_b64 v[10:11], v10, v[2:3]
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
 ; GFX9-NEXT:    v_or_b32_e32 v10, v8, v10
 ; GFX9-NEXT:    v_subrev_u32_e32 v8, 64, v24
 ; GFX9-NEXT:    v_or_b32_e32 v11, v9, v11
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[0:1]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v8, v[2:3]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v24
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v15, v9, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v9, v1, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v8, v10, vcc
-; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v24, v[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, v10, v2, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b64 v[8:9], v24, v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v10, v0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v17, 0, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v16, 0, v8, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v28, vcc, -1, v23
-; GFX9-NEXT:    v_addc_co_u32_e32 v29, vcc, -1, v21, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v29, vcc, -1, v22, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v30, vcc, -1, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v18, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
@@ -168,7 +169,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_or_b32_e32 v12, v18, v12
 ; GFX9-NEXT:    v_and_b32_e32 v18, v8, v23
 ; GFX9-NEXT:    v_or_b32_e32 v13, v19, v13
-; GFX9-NEXT:    v_and_b32_e32 v19, v8, v21
+; GFX9-NEXT:    v_and_b32_e32 v19, v8, v22
 ; GFX9-NEXT:    v_sub_co_u32_e32 v14, vcc, v14, v18
 ; GFX9-NEXT:    v_and_b32_e32 v32, v8, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v15, vcc, v15, v19, vcc
@@ -207,7 +208,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v13, v4, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v14, v6
-; GFX9-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v21, v13, v[14:15]
+; GFX9-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v22, v13, v[14:15]
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v10, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v11, v23
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v14
@@ -218,272 +219,283 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v14
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v8
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v12, v12, v21
-; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v21, v10, v[8:9]
+; GFX9-NEXT:    v_mul_lo_u32 v12, v12, v22
+; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v22, v10, v[8:9]
 ; GFX9-NEXT:    v_add3_u32 v4, v11, v7, v12
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v13
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v6, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, v0, v20
-; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v20
-; GFX9-NEXT:    v_xor_b32_e32 v4, v1, v22
-; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v22
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v20
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v21
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v20
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v22, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v20, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v22, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v20
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v21, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v21
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v20, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v21, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O0-LABEL: v_srem_i128_vv:
 ; GFX9-O0:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX9-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v7
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v7
+; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
+; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v3
+; GFX9-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v3
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[11:12], s4, v[10:11]
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v13
+; GFX9-O0-NEXT:    v_ashrrev_i64 v[3:4], s4, v[3:4]
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v6
-; GFX9-O0-NEXT:    v_ashrrev_i64 v[15:16], s4, v[13:14]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v12
-; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v10
-; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v11
-; GFX9-O0-NEXT:    v_xor_b32_e64 v13, v8, v12
-; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-O0-NEXT:    v_xor_b32_e64 v3, v3, v10
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_xor_b32_e64 v1, v1, v12
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v16
-; GFX9-O0-NEXT:    v_xor_b32_e64 v9, v8, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v15
-; GFX9-O0-NEXT:    v_xor_b32_e64 v7, v7, v6
-; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_xor_b32_e64 v9, v9, v3
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, v6
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v14
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v9, vcc, v9, v12
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v13, vcc, v11, v12, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v13
+; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 1
+; GFX9-O0-NEXT:    s_mov_b32 s10, s6
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s10, 2
+; GFX9-O0-NEXT:    s_mov_b32 s11, s7
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s11, 3
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v10, vcc, s10, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v4, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s10
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v2, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v2
+; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[12:13], s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v4
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v1
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
-; GFX9-O0-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v6
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v3, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v5, v6, vcc
-; GFX9-O0-NEXT:    v_subb_co_u32_e32 v3, vcc, v2, v3, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v19
+; GFX9-O0-NEXT:    v_sub_co_u32_e32 v14, vcc, s10, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v11, v10, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s10
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v8, vcc
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s11
+; GFX9-O0-NEXT:    v_subb_co_u32_e32 v13, vcc, v13, v9, vcc
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v15
+; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[18:19], s[4:5]
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[4:5]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v14
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s[4:5]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v10
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v12
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, v9, v13, s[4:5]
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 killed $vgpr11_vgpr12 killed $exec
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[4:5]
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
-; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v3
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v5
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v17
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v19
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v12
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v19
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v14
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v5
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v17
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9-O0-NEXT:    v_or_b32_e64 v3, v8, v7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT:    v_or_b32_e64 v1, v5, v6
-; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 1
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[1:2], s[6:7]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v10
-; GFX9-O0-NEXT:    v_or_b32_e64 v15, v4, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v9
-; GFX9-O0-NEXT:    v_or_b32_e64 v9, v3, v1
-; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v15
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[9:10], s[6:7]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v19
+; GFX9-O0-NEXT:    v_or_b32_e64 v15, v13, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v18
+; GFX9-O0-NEXT:    v_or_b32_e64 v13, v13, v14
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v15
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[13:14], s[6:7]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX9-O0-NEXT:    v_or_b32_e64 v15, v13, v14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v16
+; GFX9-O0-NEXT:    v_or_b32_e64 v13, v13, v14
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v15
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[13:14], s[6:7]
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v6, v6
-; GFX9-O0-NEXT:    s_mov_b32 s9, 32
-; GFX9-O0-NEXT:    v_add_u32_e64 v6, v6, s9
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v7, v7
-; GFX9-O0-NEXT:    v_min_u32_e64 v6, v6, v7
-; GFX9-O0-NEXT:    s_mov_b32 s8, 0
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s8
-; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v7
-; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v5
-; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s9
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[11:12], s[8:9]
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v8, v8
-; GFX9-O0-NEXT:    v_min_u32_e64 v15, v5, v8
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v5
-; GFX9-O0-NEXT:    s_mov_b64 s[10:11], 64
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v15
-; GFX9-O0-NEXT:    s_mov_b32 s12, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v16
-; GFX9-O0-NEXT:    s_mov_b32 s14, s11
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v8, s[12:13], v8, s12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s14
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v5, v9, s[12:13]
+; GFX9-O0-NEXT:    s_mov_b32 s13, 32
+; GFX9-O0-NEXT:    v_add_u32_e64 v8, v8, s13
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v9, v9
+; GFX9-O0-NEXT:    v_min_u32_e64 v8, v8, v9
+; GFX9-O0-NEXT:    s_mov_b32 s12, 0
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    s_mov_b64 s[12:13], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[12:13], v[11:12], s[12:13]
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s[12:13]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, v6, v7, s[12:13]
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v7, v7
+; GFX9-O0-NEXT:    v_add_u32_e64 v7, v7, s13
+; GFX9-O0-NEXT:    v_ffbh_u32_e64 v10, v10
+; GFX9-O0-NEXT:    v_min_u32_e64 v13, v7, v10
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-O0-NEXT:    ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v7
+; GFX9-O0-NEXT:    s_mov_b64 s[14:15], 64
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v13
+; GFX9-O0-NEXT:    s_mov_b32 s16, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
+; GFX9-O0-NEXT:    s_mov_b32 s18, s15
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v10, s[16:17], v10, s16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s18
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v7, s[16:17], v7, v11, s[16:17]
+; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v11
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s[8:9]
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v10
+; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, v8, v9, s[8:9]
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v7
+; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[5:6], s[8:9]
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v1
-; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s9
+; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s13
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v6, v2
 ; GFX9-O0-NEXT:    v_min_u32_e64 v6, v5, v6
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s8
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v7
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v5, v3
-; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s9
+; GFX9-O0-NEXT:    v_add_u32_e64 v5, v5, s13
 ; GFX9-O0-NEXT:    v_ffbh_u32_e64 v11, v4
-; GFX9-O0-NEXT:    v_min_u32_e64 v15, v5, v11
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s8
-; GFX9-O0-NEXT:    ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v15
-; GFX9-O0-NEXT:    s_mov_b32 s8, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v16
-; GFX9-O0-NEXT:    s_mov_b32 s10, s11
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v11, s[8:9], v11, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s10
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v5, s[8:9], v5, v12, s[8:9]
+; GFX9-O0-NEXT:    v_min_u32_e64 v12, v5, v11
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v12
+; GFX9-O0-NEXT:    s_mov_b32 s12, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX9-O0-NEXT:    s_mov_b32 s14, s15
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v11, s[12:13], v11, s12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s14
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v5, v12, s[12:13]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[13:14], s[8:9]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[8:9]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
@@ -496,8 +508,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT:    s_mov_b32 s10, s6
-; GFX9-O0-NEXT:    s_mov_b32 s11, s7
 ; GFX9-O0-NEXT:    v_sub_co_u32_e32 v5, vcc, v5, v8
 ; GFX9-O0-NEXT:    v_subb_co_u32_e32 v9, vcc, v6, v7, vcc
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s10
@@ -580,75 +590,75 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s4, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s5, 3
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s4, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s5, 5
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX9-O0-NEXT:    s_branch .LBB0_8
 ; GFX9-O0-NEXT:  .LBB0_1: ; %Flow
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v0, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v0, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v0, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v0, 7
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:  ; %bb.2: ; %Flow
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_5
 ; GFX9-O0-NEXT:  .LBB0_3: ; %Flow2
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v4, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v4, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v4, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v4, 5
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_9
 ; GFX9-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 1
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[2:3], s4, v[0:1]
@@ -681,67 +691,67 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_3
 ; GFX9-O0-NEXT:  .LBB0_5: ; %Flow1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
 ; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_4
 ; GFX9-O0-NEXT:  .LBB0_6: ; %udiv-do-while
 ; GFX9-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_readlane_b32 s6, v16, 8
-; GFX9-O0-NEXT:    v_readlane_b32 s7, v16, 9
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_readlane_b32 s6, v16, 10
+; GFX9-O0-NEXT:    v_readlane_b32 s7, v16, 11
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 63
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(16)
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[29:30], s4, v[2:3]
@@ -881,72 +891,72 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v2
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v0
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v15
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v14
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v13
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v12
-; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 7
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s6, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s7, 11
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execnz .LBB0_6
 ; GFX9-O0-NEXT:    s_branch .LBB0_1
 ; GFX9-O0-NEXT:  .LBB0_7: ; %udiv-preheader
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
@@ -1027,51 +1037,51 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v17
 ; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s6
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s4, 8
-; GFX9-O0-NEXT:    v_writelane_b32 v16, s5, 9
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s4, 10
+; GFX9-O0-NEXT:    v_writelane_b32 v16, s5, 11
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
-; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
+; GFX9-O0-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_branch .LBB0_6
 ; GFX9-O0-NEXT:  .LBB0_8: ; %udiv-bb1
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -1108,14 +1118,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 0x7f
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v3, s4, v4
 ; GFX9-O0-NEXT:    v_lshlrev_b64 v[5:6], v3, v[11:12]
@@ -1161,12 +1171,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v10
 ; GFX9-O0-NEXT:    v_or_b32_e64 v3, v3, v4
@@ -1181,33 +1191,33 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s7
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-O0-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-O0-NEXT:    s_xor_b64 s[6:7], s[4:5], s[6:7]
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 7
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s6, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v0, s7, 9
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_cbranch_execz .LBB0_5
 ; GFX9-O0-NEXT:    s_branch .LBB0_7
 ; GFX9-O0-NEXT:  .LBB0_9: ; %udiv-end
-; GFX9-O0-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-O0-NEXT:    s_or_saveexec_b64 s[22:23], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-O0-NEXT:    s_mov_b64 exec, s[22:23]
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
@@ -1218,10 +1228,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b32 s4, 32
@@ -1495,11 +1505,11 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-O0-NEXT:    ; kill: killed $vgpr4
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index ea30a63b0be19..6372d74161fad 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -28,34 +28,33 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
-; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
+; GCN-NEXT:    v_max_i32_e32 v2, v1, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v2
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v5
+; GCN-NEXT:    v_max_i32_e32 v5, v0, v5
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, v3, v1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v0, v1
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_mul_hi_u32 v3, v5, v3
+; GCN-NEXT:    v_mul_lo_u32 v1, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v1, v2
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -73,34 +72,33 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; TONGA-NEXT:    s_mov_b32 s0, s4
 ; TONGA-NEXT:    s_mov_b32 s1, s5
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v2
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v1
-; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, 0, v1
+; TONGA-NEXT:    v_max_i32_e32 v2, v1, v2
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
+; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v5
+; TONGA-NEXT:    v_max_i32_e32 v5, v0, v5
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
+; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
-; TONGA-NEXT:    v_mul_hi_u32 v3, v0, v3
-; TONGA-NEXT:    v_mul_lo_u32 v4, v3, v1
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, v0, v1
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; TONGA-NEXT:    v_mul_hi_u32 v3, v5, v3
+; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
+; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v5, v1
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v1, v2
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v2
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v0
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
 ; TONGA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -115,40 +113,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX9-NEXT:    s_mov_b32 s8, s6
 ; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    s_ashr_i32 s6, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s6
-; GFX9-NEXT:    s_xor_b32 s7, s0, s6
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX9-NEXT:    s_abs_i32 s7, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX9-NEXT:    s_xor_b32 s5, s4, s6
+; GFX9-NEXT:    s_sub_i32 s6, 0, s7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s4, s4, s5
-; GFX9-NEXT:    s_xor_b32 s6, s5, s6
-; GFX9-NEXT:    s_xor_b32 s4, s4, s5
+; GFX9-NEXT:    s_abs_i32 s4, s4
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s5, 0, s7
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s8
-; GFX9-NEXT:    s_mul_hi_u32 s5, s8, s5
-; GFX9-NEXT:    s_add_i32 s8, s8, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s8
-; GFX9-NEXT:    s_mul_i32 s8, s5, s7
+; GFX9-NEXT:    s_mul_i32 s6, s6, s8
+; GFX9-NEXT:    s_mul_hi_u32 s6, s8, s6
+; GFX9-NEXT:    s_add_i32 s8, s8, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s8
+; GFX9-NEXT:    s_mul_i32 s8, s6, s7
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s8
-; GFX9-NEXT:    s_add_i32 s9, s5, 1
+; GFX9-NEXT:    s_add_i32 s9, s6, 1
 ; GFX9-NEXT:    s_sub_i32 s8, s4, s7
 ; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
-; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
 ; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX9-NEXT:    s_add_i32 s8, s5, 1
+; GFX9-NEXT:    s_add_i32 s8, s6, 1
 ; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
-; GFX9-NEXT:    s_cselect_b32 s4, s8, s5
-; GFX9-NEXT:    s_xor_b32 s4, s4, s6
-; GFX9-NEXT:    s_sub_i32 s4, s4, s6
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s6
+; GFX9-NEXT:    s_xor_b32 s4, s4, s5
+; GFX9-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -408,62 +403,60 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
-; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GCN-NEXT:    v_xor_b32_e32 v2, v2, v5
-; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GCN-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GCN-NEXT:    v_xor_b32_e32 v9, v6, v7
-; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v3
-; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v2
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
+; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
+; GCN-NEXT:    v_xor_b32_e32 v4, v0, v2
+; GCN-NEXT:    v_xor_b32_e32 v7, v1, v3
+; GCN-NEXT:    v_max_i32_e32 v2, v2, v6
+; GCN-NEXT:    v_max_i32_e32 v3, v3, v9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v9, v3
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GCN-NEXT:    v_max_i32_e32 v0, v0, v5
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v9
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GCN-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GCN-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
+; GCN-NEXT:    v_mul_lo_u32 v9, v9, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v10, v5
-; GCN-NEXT:    v_mul_lo_u32 v11, v11, v7
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v4, v5, v10
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GCN-NEXT:    v_mul_hi_u32 v6, v7, v11
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v4
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
+; GCN-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GCN-NEXT:    v_max_i32_e32 v1, v1, v8
+; GCN-NEXT:    v_mul_hi_u32 v8, v5, v10
+; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
 ; GCN-NEXT:    v_mul_lo_u32 v10, v5, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
-; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v1, v3
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, v0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v7
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -481,62 +474,60 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    s_mov_b32 s4, s0
 ; TONGA-NEXT:    s_mov_b32 s5, s1
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
-; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
-; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v5
-; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
-; TONGA-NEXT:    v_xor_b32_e32 v8, v4, v5
-; TONGA-NEXT:    v_xor_b32_e32 v9, v6, v7
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v7, v3
-; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v2
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v3
+; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v3
+; TONGA-NEXT:    v_xor_b32_e32 v4, v0, v2
+; TONGA-NEXT:    v_xor_b32_e32 v7, v1, v3
+; TONGA-NEXT:    v_max_i32_e32 v2, v2, v6
+; TONGA-NEXT:    v_max_i32_e32 v3, v3, v9
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v3
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; TONGA-NEXT:    v_max_i32_e32 v0, v0, v5
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v9
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; TONGA-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; TONGA-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
+; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v3
+; TONGA-NEXT:    v_mul_lo_u32 v9, v9, v6
 ; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v5
-; TONGA-NEXT:    v_mul_lo_u32 v11, v11, v7
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v6
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
-; TONGA-NEXT:    v_mul_hi_u32 v4, v5, v10
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
-; TONGA-NEXT:    v_mul_hi_u32 v6, v7, v11
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v7, v6
-; TONGA-NEXT:    v_mul_hi_u32 v4, v0, v4
+; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, 0, v1
+; TONGA-NEXT:    v_mul_hi_u32 v9, v6, v9
+; TONGA-NEXT:    v_max_i32_e32 v1, v1, v8
+; TONGA-NEXT:    v_mul_hi_u32 v8, v5, v10
+; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v5, v8
+; TONGA-NEXT:    v_mul_hi_u32 v6, v0, v6
 ; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
-; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
+; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
+; TONGA-NEXT:    v_mul_lo_u32 v8, v6, v2
 ; TONGA-NEXT:    v_mul_lo_u32 v10, v5, v3
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v6
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
 ; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v10
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
-; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, v0, v2
-; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v7, vcc, v1, v3
+; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v0, v2
+; TONGA-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v1, v3
 ; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
-; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
+; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
+; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; TONGA-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v9
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
-; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v9
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v5, v9, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v7
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v7
 ; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -553,69 +544,63 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_xor_b32 s6, s0, s1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    s_abs_i32 s1, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s1
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
-; GFX9-NEXT:    s_add_i32 s7, s7, s8
+; GFX9-NEXT:    s_xor_b32 s0, s7, s0
+; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    s_xor_b32 s9, s8, s1
-; GFX9-NEXT:    s_xor_b32 s1, s7, s8
-; GFX9-NEXT:    s_sub_i32 s7, 0, s6
+; GFX9-NEXT:    s_sub_i32 s0, 0, s1
+; GFX9-NEXT:    s_abs_i32 s7, s7
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s0, s9, s0
+; GFX9-NEXT:    s_add_i32 s9, s9, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s7, s9
+; GFX9-NEXT:    s_mul_i32 s9, s0, s1
+; GFX9-NEXT:    s_sub_i32 s7, s7, s9
+; GFX9-NEXT:    s_add_i32 s10, s0, 1
+; GFX9-NEXT:    s_sub_i32 s9, s7, s1
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX9-NEXT:    s_add_i32 s9, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s1
+; GFX9-NEXT:    s_cselect_b32 s7, s9, s0
+; GFX9-NEXT:    s_abs_i32 s9, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX9-NEXT:    s_mov_b32 s0, s4
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s5, s4, s6
+; GFX9-NEXT:    s_xor_b32 s6, s7, s8
+; GFX9-NEXT:    s_sub_i32 s7, 0, s9
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9-NEXT:    s_abs_i32 s4, s4
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 31
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX9-NEXT:    s_mul_i32 s7, s7, s8
 ; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
 ; GFX9-NEXT:    s_add_i32 s8, s8, s7
-; GFX9-NEXT:    s_mul_hi_u32 s7, s1, s8
-; GFX9-NEXT:    s_mul_i32 s8, s7, s6
-; GFX9-NEXT:    s_sub_i32 s1, s1, s8
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s8
+; GFX9-NEXT:    s_mul_i32 s8, s7, s9
+; GFX9-NEXT:    s_sub_i32 s4, s4, s8
 ; GFX9-NEXT:    s_add_i32 s10, s7, 1
-; GFX9-NEXT:    s_sub_i32 s8, s1, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX9-NEXT:    s_sub_i32 s8, s4, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s9
 ; GFX9-NEXT:    s_cselect_b32 s7, s10, s7
-; GFX9-NEXT:    s_cselect_b32 s1, s8, s1
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
 ; GFX9-NEXT:    s_add_i32 s8, s7, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s8, s7
-; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
-; GFX9-NEXT:    s_add_i32 s4, s4, s7
-; GFX9-NEXT:    s_xor_b32 s4, s4, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s6, s6, s9
-; GFX9-NEXT:    s_add_i32 s5, s5, s8
-; GFX9-NEXT:    s_xor_b32 s7, s8, s7
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s6, s6, s9
-; GFX9-NEXT:    s_xor_b32 s5, s5, s8
-; GFX9-NEXT:    s_sub_i32 s8, 0, s4
-; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
-; GFX9-NEXT:    s_mul_i32 s8, s8, s9
-; GFX9-NEXT:    s_mul_hi_u32 s8, s9, s8
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_mul_hi_u32 s8, s5, s9
-; GFX9-NEXT:    s_mul_i32 s9, s8, s4
-; GFX9-NEXT:    s_sub_i32 s5, s5, s9
-; GFX9-NEXT:    s_add_i32 s10, s8, 1
-; GFX9-NEXT:    s_sub_i32 s9, s5, s4
-; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
-; GFX9-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
-; GFX9-NEXT:    s_add_i32 s9, s8, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s8
-; GFX9-NEXT:    s_xor_b32 s4, s4, s7
-; GFX9-NEXT:    s_sub_i32 s4, s4, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s9
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s7
+; GFX9-NEXT:    s_xor_b32 s4, s4, s5
+; GFX9-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -819,119 +804,115 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mov_b32 s8, s0
 ; GCN-NEXT:    s_mov_b32 s9, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
-; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
-; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
-; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
-; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
-; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
-; GCN-NEXT:    v_xor_b32_e32 v15, v8, v9
-; GCN-NEXT:    v_xor_b32_e32 v16, v10, v11
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
-; GCN-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v5
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
-; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
-; GCN-NEXT:    v_xor_b32_e32 v17, v12, v13
-; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
-; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v6
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
+; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
+; GCN-NEXT:    v_sub_i32_e32 v13, vcc, 0, v5
+; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 0, v6
+; GCN-NEXT:    v_xor_b32_e32 v8, v0, v4
+; GCN-NEXT:    v_xor_b32_e32 v11, v1, v5
+; GCN-NEXT:    v_xor_b32_e32 v14, v2, v6
+; GCN-NEXT:    v_max_i32_e32 v4, v4, v10
+; GCN-NEXT:    v_max_i32_e32 v5, v5, v13
+; GCN-NEXT:    v_max_i32_e32 v6, v6, v16
+; GCN-NEXT:    v_max_i32_e32 v1, v1, v12
+; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v14
+; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v14, v5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v16, v6
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
-; GCN-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GCN-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
-; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v14, v14
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v16, v16
+; GCN-NEXT:    v_sub_i32_e32 v15, vcc, 0, v2
 ; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; GCN-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
+; GCN-NEXT:    v_mul_f32_e32 v16, 0x4f7ffffe, v16
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v5
-; GCN-NEXT:    v_mul_lo_u32 v9, v9, v8
-; GCN-NEXT:    v_mul_lo_u32 v11, v11, v10
-; GCN-NEXT:    v_sub_i32_e32 v13, vcc, 0, v6
+; GCN-NEXT:    v_cvt_u32_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_u32_f32_e32 v16, v16
+; GCN-NEXT:    v_sub_i32_e32 v17, vcc, 0, v7
+; GCN-NEXT:    v_max_i32_e32 v0, v0, v9
+; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v11
+; GCN-NEXT:    v_max_i32_e32 v2, v2, v15
+; GCN-NEXT:    v_max_i32_e32 v11, v7, v17
+; GCN-NEXT:    v_sub_i32_e32 v13, vcc, 0, v4
+; GCN-NEXT:    v_sub_i32_e32 v15, vcc, 0, v5
+; GCN-NEXT:    v_sub_i32_e32 v17, vcc, 0, v6
 ; GCN-NEXT:    v_mul_lo_u32 v13, v13, v12
-; GCN-NEXT:    v_mul_hi_u32 v9, v8, v9
-; GCN-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; GCN-NEXT:    v_mul_lo_u32 v15, v15, v14
+; GCN-NEXT:    v_mul_lo_u32 v17, v17, v16
+; GCN-NEXT:    v_cvt_f32_u32_e32 v18, v11
 ; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GCN-NEXT:    v_xor_b32_e32 v7, v7, v14
-; GCN-NEXT:    v_cvt_f32_u32_e32 v18, v7
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v11
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GCN-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v12, v13
-; GCN-NEXT:    v_mul_hi_u32 v10, v2, v10
+; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
+; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v18, v18
-; GCN-NEXT:    v_mul_lo_u32 v11, v8, v4
-; GCN-NEXT:    v_mul_lo_u32 v13, v9, v5
-; GCN-NEXT:    v_mul_lo_u32 v21, v10, v6
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GCN-NEXT:    v_add_i32_e32 v13, vcc, v14, v15
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, v16, v17
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v12
+; GCN-NEXT:    v_mul_hi_u32 v13, v1, v13
+; GCN-NEXT:    v_mul_hi_u32 v14, v2, v14
 ; GCN-NEXT:    v_mul_f32_e32 v18, 0x4f7ffffe, v18
+; GCN-NEXT:    v_mul_lo_u32 v15, v12, v4
+; GCN-NEXT:    v_mul_lo_u32 v17, v13, v5
+; GCN-NEXT:    v_mul_lo_u32 v21, v14, v6
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v18, v18
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v11
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v13
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 1, v9
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v15
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v21
+; GCN-NEXT:    v_add_i32_e32 v16, vcc, 1, v12
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 1, v13
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 1, v14
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v21
-; GCN-NEXT:    v_sub_i32_e32 v11, vcc, v0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v12, vcc, v1, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v20, s[2:3]
-; GCN-NEXT:    v_sub_i32_e32 v19, vcc, 0, v7
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 1, v10
-; GCN-NEXT:    v_sub_i32_e32 v13, vcc, v2, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v9
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GCN-NEXT:    v_sub_i32_e32 v19, vcc, 0, v11
+; GCN-NEXT:    v_sub_i32_e32 v17, vcc, v0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v16, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v16, vcc, v1, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v20, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[4:5]
+; GCN-NEXT:    v_mul_lo_u32 v19, v19, v18
+; GCN-NEXT:    v_sub_i32_e32 v20, vcc, v2, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v17, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v15, vcc, 1, v12
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v16, s[2:3]
+; GCN-NEXT:    v_add_i32_e32 v16, vcc, 1, v13
+; GCN-NEXT:    v_add_i32_e32 v17, vcc, 1, v14
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v19, v18
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v8, v11, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v12, v15, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GCN-NEXT:    v_mul_hi_u32 v4, v18, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; GCN-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v18, v4
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v22, s[4:5]
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v15
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v16
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s[4:5]
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v15
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v16
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
+; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v13, v16, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v9
+; GCN-NEXT:    v_mul_hi_u32 v4, v18, v19
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v20, s[4:5]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, v4, v7
-; GCN-NEXT:    v_xor_b32_e32 v2, v2, v17
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_xor_b32_e32 v6, v8, v14
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GCN-NEXT:    v_sub_i32_e32 v8, vcc, v3, v7
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v14, v17, vcc
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; GCN-NEXT:    v_max_i32_e32 v5, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v18, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, v5, v4
+; GCN-NEXT:    v_xor_b32_e32 v2, v2, v10
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v11
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
+; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v5, v11
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v11
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v11
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    v_xor_b32_e32 v4, v4, v3
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -950,119 +931,115 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    s_mov_b32 s8, s0
 ; TONGA-NEXT:    s_mov_b32 s9, s1
 ; TONGA-NEXT:    s_waitcnt vmcnt(1)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v1
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
-; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
-; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v4, v9
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
-; TONGA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v10
-; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
-; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
-; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
-; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
-; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v6, v13
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v5
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v12
-; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
-; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
-; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v6
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v4
+; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v4
+; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v5
+; TONGA-NEXT:    v_sub_u32_e32 v16, vcc, 0, v6
+; TONGA-NEXT:    v_xor_b32_e32 v8, v0, v4
+; TONGA-NEXT:    v_xor_b32_e32 v11, v1, v5
+; TONGA-NEXT:    v_xor_b32_e32 v14, v2, v6
+; TONGA-NEXT:    v_max_i32_e32 v4, v4, v10
+; TONGA-NEXT:    v_max_i32_e32 v5, v5, v13
+; TONGA-NEXT:    v_max_i32_e32 v6, v6, v16
+; TONGA-NEXT:    v_max_i32_e32 v1, v1, v12
+; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v14
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v4
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v14, v5
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v16, v6
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v0
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
-; TONGA-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; TONGA-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v14, v14
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v16, v16
+; TONGA-NEXT:    v_sub_u32_e32 v15, vcc, 0, v2
 ; TONGA-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
+; TONGA-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
+; TONGA-NEXT:    v_mul_f32_e32 v16, 0x4f7ffffe, v16
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v5
-; TONGA-NEXT:    v_mul_lo_u32 v9, v9, v8
-; TONGA-NEXT:    v_mul_lo_u32 v11, v11, v10
-; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v6
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v14, v14
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v16, v16
+; TONGA-NEXT:    v_sub_u32_e32 v17, vcc, 0, v7
+; TONGA-NEXT:    v_max_i32_e32 v0, v0, v9
+; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v11
+; TONGA-NEXT:    v_max_i32_e32 v2, v2, v15
+; TONGA-NEXT:    v_max_i32_e32 v11, v7, v17
+; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v4
+; TONGA-NEXT:    v_sub_u32_e32 v15, vcc, 0, v5
+; TONGA-NEXT:    v_sub_u32_e32 v17, vcc, 0, v6
 ; TONGA-NEXT:    v_mul_lo_u32 v13, v13, v12
-; TONGA-NEXT:    v_mul_hi_u32 v9, v8, v9
-; TONGA-NEXT:    v_mul_hi_u32 v11, v10, v11
-; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v7, v14
+; TONGA-NEXT:    v_mul_lo_u32 v15, v15, v14
+; TONGA-NEXT:    v_mul_lo_u32 v17, v17, v16
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v18, v11
 ; TONGA-NEXT:    v_mul_hi_u32 v13, v12, v13
-; TONGA-NEXT:    v_xor_b32_e32 v7, v7, v14
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v18, v7
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v10, v11
-; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
-; TONGA-NEXT:    v_mul_hi_u32 v9, v1, v9
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v12, v13
-; TONGA-NEXT:    v_mul_hi_u32 v10, v2, v10
+; TONGA-NEXT:    v_mul_hi_u32 v15, v14, v15
+; TONGA-NEXT:    v_mul_hi_u32 v17, v16, v17
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v18, v18
-; TONGA-NEXT:    v_mul_lo_u32 v11, v8, v4
-; TONGA-NEXT:    v_mul_lo_u32 v13, v9, v5
-; TONGA-NEXT:    v_mul_lo_u32 v21, v10, v6
+; TONGA-NEXT:    v_add_u32_e32 v12, vcc, v12, v13
+; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v14, v15
+; TONGA-NEXT:    v_add_u32_e32 v14, vcc, v16, v17
+; TONGA-NEXT:    v_mul_hi_u32 v12, v0, v12
+; TONGA-NEXT:    v_mul_hi_u32 v13, v1, v13
+; TONGA-NEXT:    v_mul_hi_u32 v14, v2, v14
 ; TONGA-NEXT:    v_mul_f32_e32 v18, 0x4f7ffffe, v18
+; TONGA-NEXT:    v_mul_lo_u32 v15, v12, v4
+; TONGA-NEXT:    v_mul_lo_u32 v17, v13, v5
+; TONGA-NEXT:    v_mul_lo_u32 v21, v14, v6
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v18, v18
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v11
-; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v13
-; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
-; TONGA-NEXT:    v_add_u32_e32 v20, vcc, 1, v9
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v15
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v17
+; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v21
+; TONGA-NEXT:    v_add_u32_e32 v16, vcc, 1, v12
+; TONGA-NEXT:    v_add_u32_e32 v20, vcc, 1, v13
+; TONGA-NEXT:    v_add_u32_e32 v15, vcc, 1, v14
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
-; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v21
-; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, v0, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v1, v5
-; TONGA-NEXT:    v_cndmask_b32_e64 v9, v9, v20, s[2:3]
-; TONGA-NEXT:    v_sub_u32_e32 v19, vcc, 0, v7
-; TONGA-NEXT:    v_add_u32_e32 v22, vcc, 1, v10
-; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, v2, v6
-; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v8
-; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
-; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v9
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; TONGA-NEXT:    v_sub_u32_e32 v19, vcc, 0, v11
+; TONGA-NEXT:    v_sub_u32_e32 v17, vcc, v0, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v12, v12, v16, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v16, vcc, v1, v5
+; TONGA-NEXT:    v_cndmask_b32_e64 v13, v13, v20, s[2:3]
+; TONGA-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[4:5]
+; TONGA-NEXT:    v_mul_lo_u32 v19, v19, v18
+; TONGA-NEXT:    v_sub_u32_e32 v20, vcc, v2, v6
+; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v17, s[0:1]
+; TONGA-NEXT:    v_add_u32_e32 v15, vcc, 1, v12
+; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v16, s[2:3]
+; TONGA-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
+; TONGA-NEXT:    v_add_u32_e32 v17, vcc, 1, v14
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; TONGA-NEXT:    v_mul_lo_u32 v4, v19, v18
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v8, v11, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v12, v15, vcc
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; TONGA-NEXT:    v_mul_hi_u32 v4, v18, v4
-; TONGA-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc
-; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
-; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v8
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v18, v4
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v10, v10, v22, s[4:5]
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v15
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v16
-; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s[4:5]
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v15
-; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v16
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v10
+; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v13, v16, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v9
+; TONGA-NEXT:    v_mul_hi_u32 v4, v18, v19
+; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v20, s[4:5]
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v9
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; TONGA-NEXT:    v_cndmask_b32_e32 v2, v10, v5, vcc
-; TONGA-NEXT:    v_mul_lo_u32 v5, v4, v7
-; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v17
-; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v17
-; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
-; TONGA-NEXT:    v_xor_b32_e32 v6, v8, v14
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
-; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v3, v7
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; TONGA-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; TONGA-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v6
-; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
+; TONGA-NEXT:    v_cndmask_b32_e32 v2, v14, v17, vcc
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v3
+; TONGA-NEXT:    v_max_i32_e32 v5, v3, v5
+; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v18, v4
+; TONGA-NEXT:    v_mul_hi_u32 v4, v5, v4
+; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v10
+; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v10
+; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v11
+; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
+; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v5, v6
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
+; TONGA-NEXT:    v_sub_u32_e32 v7, vcc, v5, v11
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v11
+; TONGA-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v11
+; TONGA-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v3
+; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v4, v3
 ; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -1078,138 +1055,126 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
 ; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    s_add_i32 s1, s1, s4
-; GFX9-NEXT:    s_xor_b32 s6, s1, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_abs_i32 s1, s0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v4
-; GFX9-NEXT:    s_ashr_i32 s9, s8, 31
-; GFX9-NEXT:    s_add_i32 s8, s8, s9
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX9-NEXT:    s_xor_b32 s0, s7, s0
+; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s4, s9, s4
-; GFX9-NEXT:    s_xor_b32 s8, s8, s9
-; GFX9-NEXT:    s_sub_i32 s9, 0, s6
+; GFX9-NEXT:    s_sub_i32 s0, 0, s1
+; GFX9-NEXT:    s_abs_i32 s7, s7
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s7, v1
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v3
-; GFX9-NEXT:    v_readfirstlane_b32 s10, v0
-; GFX9-NEXT:    s_mul_i32 s9, s9, s10
-; GFX9-NEXT:    s_mul_hi_u32 s9, s10, s9
-; GFX9-NEXT:    s_add_i32 s10, s10, s9
-; GFX9-NEXT:    s_mul_hi_u32 s9, s8, s10
-; GFX9-NEXT:    s_mul_i32 s10, s9, s6
-; GFX9-NEXT:    s_sub_i32 s8, s8, s10
-; GFX9-NEXT:    s_add_i32 s11, s9, 1
-; GFX9-NEXT:    s_sub_i32 s10, s8, s6
-; GFX9-NEXT:    s_cmp_ge_u32 s8, s6
-; GFX9-NEXT:    s_cselect_b32 s9, s11, s9
-; GFX9-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX9-NEXT:    s_add_i32 s10, s9, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s8, s6
-; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
-; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
-; GFX9-NEXT:    s_add_i32 s7, s7, s8
-; GFX9-NEXT:    s_xor_b32 s7, s7, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s0, s0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s0, s9, s0
+; GFX9-NEXT:    s_add_i32 s9, s9, s0
+; GFX9-NEXT:    s_mul_hi_u32 s0, s7, s9
+; GFX9-NEXT:    s_mul_i32 s9, s0, s1
+; GFX9-NEXT:    s_sub_i32 s7, s7, s9
+; GFX9-NEXT:    s_add_i32 s10, s0, 1
+; GFX9-NEXT:    s_sub_i32 s9, s7, s1
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX9-NEXT:    s_add_i32 s9, s0, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s1
+; GFX9-NEXT:    s_cselect_b32 s1, s9, s0
+; GFX9-NEXT:    s_abs_i32 s7, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    v_readfirstlane_b32 s10, v5
-; GFX9-NEXT:    s_ashr_i32 s11, s10, 31
-; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    s_xor_b32 s1, s1, s8
+; GFX9-NEXT:    s_sub_i32 s10, 0, s7
+; GFX9-NEXT:    s_sub_i32 s8, s1, s8
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_add_i32 s10, s10, s11
-; GFX9-NEXT:    s_xor_b32 s8, s11, s8
-; GFX9-NEXT:    s_sub_i32 s4, s6, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX9-NEXT:    s_xor_b32 s6, s9, s6
+; GFX9-NEXT:    s_abs_i32 s9, s9
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s6, s10, s11
-; GFX9-NEXT:    s_sub_i32 s10, 0, s7
-; GFX9-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s11, v0
-; GFX9-NEXT:    s_mul_i32 s10, s10, s11
-; GFX9-NEXT:    s_mul_hi_u32 s10, s11, s10
-; GFX9-NEXT:    s_add_i32 s11, s11, s10
-; GFX9-NEXT:    s_mul_hi_u32 s10, s6, s11
-; GFX9-NEXT:    s_mul_i32 s11, s10, s7
-; GFX9-NEXT:    s_sub_i32 s6, s6, s11
-; GFX9-NEXT:    s_add_i32 s12, s10, 1
-; GFX9-NEXT:    s_sub_i32 s11, s6, s7
-; GFX9-NEXT:    s_cmp_ge_u32 s6, s7
-; GFX9-NEXT:    s_cselect_b32 s10, s12, s10
-; GFX9-NEXT:    s_cselect_b32 s6, s11, s6
-; GFX9-NEXT:    s_add_i32 s11, s10, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s6, s7
-; GFX9-NEXT:    s_cselect_b32 s6, s11, s10
-; GFX9-NEXT:    s_ashr_i32 s7, s9, 31
-; GFX9-NEXT:    s_add_i32 s9, s9, s7
-; GFX9-NEXT:    s_xor_b32 s9, s9, s7
+; GFX9-NEXT:    s_ashr_i32 s6, s6, 31
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_i32 s10, s10, s1
+; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s10
+; GFX9-NEXT:    s_add_i32 s1, s1, s10
+; GFX9-NEXT:    s_mul_hi_u32 s1, s9, s1
+; GFX9-NEXT:    s_mul_i32 s10, s1, s7
+; GFX9-NEXT:    s_sub_i32 s9, s9, s10
+; GFX9-NEXT:    s_add_i32 s11, s1, 1
+; GFX9-NEXT:    s_sub_i32 s10, s9, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s9, s7
+; GFX9-NEXT:    s_cselect_b32 s1, s11, s1
+; GFX9-NEXT:    s_cselect_b32 s9, s10, s9
+; GFX9-NEXT:    s_add_i32 s10, s1, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s9, s7
+; GFX9-NEXT:    s_cselect_b32 s7, s10, s1
+; GFX9-NEXT:    s_abs_i32 s9, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    v_readfirstlane_b32 s11, v6
-; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
-; GFX9-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9-NEXT:    s_xor_b32 s7, s7, s6
+; GFX9-NEXT:    s_sub_i32 s11, 0, s9
+; GFX9-NEXT:    s_sub_i32 s6, s7, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_add_i32 s11, s11, s12
-; GFX9-NEXT:    s_xor_b32 s7, s12, s7
-; GFX9-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX9-NEXT:    s_xor_b32 s4, s10, s4
+; GFX9-NEXT:    s_abs_i32 s10, s10
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s8, s11, s12
-; GFX9-NEXT:    s_sub_i32 s11, 0, s9
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 31
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s11, s11, s7
+; GFX9-NEXT:    s_mul_hi_u32 s11, s7, s11
+; GFX9-NEXT:    s_add_i32 s7, s7, s11
+; GFX9-NEXT:    s_mul_hi_u32 s7, s10, s7
+; GFX9-NEXT:    s_mul_i32 s11, s7, s9
+; GFX9-NEXT:    s_sub_i32 s10, s10, s11
+; GFX9-NEXT:    s_add_i32 s12, s7, 1
+; GFX9-NEXT:    s_sub_i32 s11, s10, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s10, s9
+; GFX9-NEXT:    s_cselect_b32 s7, s12, s7
+; GFX9-NEXT:    s_cselect_b32 s10, s11, s10
+; GFX9-NEXT:    s_add_i32 s11, s7, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s10, s9
+; GFX9-NEXT:    s_cselect_b32 s7, s11, s7
+; GFX9-NEXT:    s_abs_i32 s9, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s9
+; GFX9-NEXT:    s_xor_b32 s7, s7, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    s_sub_i32 s8, 0, s9
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    s_sub_i32 s4, s7, s4
 ; GFX9-NEXT:    v_readfirstlane_b32 s10, v7
-; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
-; GFX9-NEXT:    s_mul_i32 s11, s11, s12
-; GFX9-NEXT:    s_mul_hi_u32 s11, s12, s11
-; GFX9-NEXT:    s_add_i32 s12, s12, s11
-; GFX9-NEXT:    s_mul_hi_u32 s11, s8, s12
-; GFX9-NEXT:    s_mul_i32 s12, s11, s9
-; GFX9-NEXT:    s_sub_i32 s8, s8, s12
-; GFX9-NEXT:    s_add_i32 s13, s11, 1
-; GFX9-NEXT:    s_sub_i32 s12, s8, s9
-; GFX9-NEXT:    s_cmp_ge_u32 s8, s9
-; GFX9-NEXT:    s_cselect_b32 s11, s13, s11
-; GFX9-NEXT:    s_cselect_b32 s8, s12, s8
-; GFX9-NEXT:    s_add_i32 s12, s11, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s8, s9
-; GFX9-NEXT:    s_cselect_b32 s8, s12, s11
-; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX9-NEXT:    s_add_i32 s5, s5, s9
-; GFX9-NEXT:    s_xor_b32 s5, s5, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_ashr_i32 s4, s10, 31
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    s_xor_b32 s6, s8, s7
-; GFX9-NEXT:    s_xor_b32 s8, s4, s9
-; GFX9-NEXT:    s_sub_i32 s6, s6, s7
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    s_sub_i32 s7, 0, s5
-; GFX9-NEXT:    s_add_i32 s10, s10, s4
-; GFX9-NEXT:    s_xor_b32 s4, s10, s4
-; GFX9-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX9-NEXT:    s_mul_i32 s7, s7, s9
-; GFX9-NEXT:    s_mul_hi_u32 s7, s9, s7
-; GFX9-NEXT:    s_add_i32 s9, s9, s7
-; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s9
-; GFX9-NEXT:    s_mul_i32 s9, s7, s5
-; GFX9-NEXT:    s_sub_i32 s4, s4, s9
+; GFX9-NEXT:    s_abs_i32 s6, s10
+; GFX9-NEXT:    s_xor_b32 s5, s10, s5
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 31
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX9-NEXT:    s_mul_i32 s8, s8, s7
+; GFX9-NEXT:    s_mul_hi_u32 s8, s7, s8
+; GFX9-NEXT:    s_add_i32 s7, s7, s8
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX9-NEXT:    s_mul_i32 s8, s7, s9
+; GFX9-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX9-NEXT:    s_add_i32 s10, s7, 1
-; GFX9-NEXT:    s_sub_i32 s9, s4, s5
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s5
+; GFX9-NEXT:    s_sub_i32 s8, s6, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
 ; GFX9-NEXT:    s_cselect_b32 s7, s10, s7
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
-; GFX9-NEXT:    s_add_i32 s9, s7, 1
-; GFX9-NEXT:    s_cmp_ge_u32 s4, s5
-; GFX9-NEXT:    s_cselect_b32 s4, s9, s7
-; GFX9-NEXT:    s_xor_b32 s4, s4, s8
-; GFX9-NEXT:    s_sub_i32 s4, s4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX9-NEXT:    s_add_i32 s8, s7, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s7
+; GFX9-NEXT:    s_xor_b32 s6, s6, s5
+; GFX9-NEXT:    s_sub_i32 s5, s6, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2009,20 +1974,19 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_bfe_i32 v2, v1, 0, 25
-; GCN-NEXT:    v_bfe_i32 v1, v1, 24, 1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v1
-; GCN-NEXT:    v_xor_b32_e32 v2, v2, v1
+; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 25
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
+; GCN-NEXT:    v_max_i32_e32 v2, v1, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v2
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; GCN-NEXT:    v_bfe_i32 v5, v0, 0, 25
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT:    v_bfe_i32 v0, v0, 24, 1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v0
+; GCN-NEXT:    v_max_i32_e32 v5, v0, v5
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_xor_b32_e32 v5, v5, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
@@ -2030,7 +1994,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    v_mul_lo_u32 v1, v3, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v1, v2
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v2, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2057,20 +2021,19 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    s_mov_b32 s0, s4
 ; TONGA-NEXT:    s_mov_b32 s1, s5
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_bfe_i32 v2, v1, 0, 25
-; TONGA-NEXT:    v_bfe_i32 v1, v1, 24, 1
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
-; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v1
+; TONGA-NEXT:    v_bfe_i32 v1, v1, 0, 25
+; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, 0, v1
+; TONGA-NEXT:    v_max_i32_e32 v2, v1, v2
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
-; TONGA-NEXT:    v_bfe_i32 v5, v0, 0, 25
+; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT:    v_bfe_i32 v0, v0, 24, 1
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
+; TONGA-NEXT:    v_max_i32_e32 v5, v0, v5
 ; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v0
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
+; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
@@ -2078,7 +2041,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
 ; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v5, v1
-; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v1, v2
+; TONGA-NEXT:    v_subrev_u32_e32 v5, vcc, v2, v1
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; TONGA-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2102,42 +2065,39 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_mov_b32 s8, s6
 ; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x190000
-; GFX9-NEXT:    s_bfe_i32 s6, s0, 0x10018
-; GFX9-NEXT:    s_add_i32 s1, s1, s6
-; GFX9-NEXT:    s_xor_b32 s7, s1, s6
+; GFX9-NEXT:    s_bfe_i32 s6, s0, 0x190000
+; GFX9-NEXT:    s_abs_i32 s7, s6
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    s_mov_b32 s1, s5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v1
-; GFX9-NEXT:    s_bfe_i32 s5, s4, 0x190000
-; GFX9-NEXT:    s_bfe_i32 s4, s4, 0x10018
-; GFX9-NEXT:    s_add_i32 s5, s5, s4
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_bfe_i32 s4, s4, 0x190000
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    s_xor_b32 s5, s4, s6
+; GFX9-NEXT:    s_sub_i32 s6, 0, s7
+; GFX9-NEXT:    s_abs_i32 s4, s4
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s6, s4, s6
-; GFX9-NEXT:    s_xor_b32 s4, s5, s4
-; GFX9-NEXT:    s_sub_i32 s5, 0, s7
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 31
 ; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX9-NEXT:    s_mul_i32 s5, s5, s8
-; GFX9-NEXT:    s_mul_hi_u32 s5, s8, s5
-; GFX9-NEXT:    s_add_i32 s8, s8, s5
-; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s8
-; GFX9-NEXT:    s_mul_i32 s8, s5, s7
+; GFX9-NEXT:    s_mul_i32 s6, s6, s8
+; GFX9-NEXT:    s_mul_hi_u32 s6, s8, s6
+; GFX9-NEXT:    s_add_i32 s8, s8, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s8
+; GFX9-NEXT:    s_mul_i32 s8, s6, s7
 ; GFX9-NEXT:    s_sub_i32 s4, s4, s8
-; GFX9-NEXT:    s_add_i32 s9, s5, 1
+; GFX9-NEXT:    s_add_i32 s9, s6, 1
 ; GFX9-NEXT:    s_sub_i32 s8, s4, s7
 ; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
-; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
 ; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
-; GFX9-NEXT:    s_add_i32 s8, s5, 1
+; GFX9-NEXT:    s_add_i32 s8, s6, 1
 ; GFX9-NEXT:    s_cmp_ge_u32 s4, s7
-; GFX9-NEXT:    s_cselect_b32 s4, s8, s5
-; GFX9-NEXT:    s_xor_b32 s4, s4, s6
-; GFX9-NEXT:    s_sub_i32 s4, s4, s6
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s6
+; GFX9-NEXT:    s_xor_b32 s4, s4, s5
+; GFX9-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX9-NEXT:    s_bfe_i32 s4, s4, 0x190000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index b086640c72f80..c310e257adadc 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -147,11 +147,11 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s9, 31
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], s[6:7]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
 ; GCN-IR-NEXT:    s_sub_u32 s12, s6, s0
 ; GCN-IR-NEXT:    s_subb_u32 s13, s7, s0
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[8:9]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[2:3]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[12:13], 0
@@ -357,13 +357,13 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v12, v0
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v12
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v3
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v12, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v12
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v0, v12
 ; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, v1, v12, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v13, v2
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v13, v3
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v13
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v3, v13
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v13, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
@@ -587,43 +587,39 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv32_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
+; GCN-NEXT:    s_load_dword s8, s[0:1], 0xe
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s8, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s8
-; GCN-NEXT:    s_xor_b32 s9, s2, s8
+; GCN-NEXT:    s_abs_i32 s9, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s2, 0, s9
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_abs_i32 s0, s3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_xor_b32 s1, s3, s8
+; GCN-NEXT:    s_ashr_i32 s1, s1, 31
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
-; GCN-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-NEXT:    s_add_i32 s3, s3, s2
-; GCN-NEXT:    s_xor_b32 s3, s3, s2
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-NEXT:    s_xor_b32 s0, s2, s8
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_mul_i32 s2, s1, s9
-; GCN-NEXT:    s_sub_i32 s2, s3, s2
-; GCN-NEXT:    s_add_i32 s8, s1, 1
-; GCN-NEXT:    s_sub_i32 s3, s2, s9
-; GCN-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-NEXT:    s_cselect_b32 s1, s8, s1
-; GCN-NEXT:    s_cselect_b32 s2, s3, s2
-; GCN-NEXT:    s_add_i32 s3, s1, 1
-; GCN-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-NEXT:    s_cselect_b32 s1, s3, s1
-; GCN-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-NEXT:    s_sub_i32 s0, s1, s0
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s3, s2, s9
+; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    s_add_i32 s8, s2, 1
+; GCN-NEXT:    s_sub_i32 s3, s0, s9
+; GCN-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-NEXT:    s_cselect_b32 s2, s8, s2
+; GCN-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-NEXT:    s_add_i32 s3, s2, 1
+; GCN-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_sub_i32 s0, s0, s1
 ; GCN-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -632,43 +628,39 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %
 ;
 ; GCN-IR-LABEL: s_test_sdiv32_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s8, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s8, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s8
-; GCN-IR-NEXT:    s_xor_b32 s9, s2, s8
+; GCN-IR-NEXT:    s_abs_i32 s9, s8
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sub_i32 s2, 0, s9
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_abs_i32 s0, s3
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_xor_b32 s1, s3, s8
+; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 31
 ; GCN-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v0
-; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT:    s_add_i32 s3, s3, s2
-; GCN-IR-NEXT:    s_xor_b32 s3, s3, s2
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-IR-NEXT:    s_xor_b32 s0, s2, s8
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-IR-NEXT:    s_mul_i32 s2, s1, s9
-; GCN-IR-NEXT:    s_sub_i32 s2, s3, s2
-; GCN-IR-NEXT:    s_add_i32 s8, s1, 1
-; GCN-IR-NEXT:    s_sub_i32 s3, s2, s9
-; GCN-IR-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-IR-NEXT:    s_cselect_b32 s1, s8, s1
-; GCN-IR-NEXT:    s_cselect_b32 s2, s3, s2
-; GCN-IR-NEXT:    s_add_i32 s3, s1, 1
-; GCN-IR-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-IR-NEXT:    s_cselect_b32 s1, s3, s1
-; GCN-IR-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-IR-NEXT:    s_sub_i32 s0, s1, s0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-IR-NEXT:    s_mul_i32 s3, s2, s9
+; GCN-IR-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-IR-NEXT:    s_add_i32 s8, s2, 1
+; GCN-IR-NEXT:    s_sub_i32 s3, s0, s9
+; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-IR-NEXT:    s_cselect_b32 s2, s8, s2
+; GCN-IR-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-IR-NEXT:    s_add_i32 s3, s2, 1
+; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-IR-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-IR-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-IR-NEXT:    s_sub_i32 s0, s0, s1
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -688,41 +680,38 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-NEXT:    s_ashr_i32 s8, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s8
-; GCN-NEXT:    s_xor_b32 s9, s2, s8
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[2:3], 33
+; GCN-NEXT:    s_abs_i32 s9, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s2, 0, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
+; GCN-NEXT:    s_abs_i32 s0, s2
+; GCN-NEXT:    s_xor_b32 s1, s2, s8
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-NEXT:    s_xor_b32 s2, s2, s3
-; GCN-NEXT:    s_xor_b32 s0, s3, s8
+; GCN-NEXT:    s_ashr_i32 s1, s1, 31
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_mul_i32 s3, s1, s9
-; GCN-NEXT:    s_sub_i32 s2, s2, s3
-; GCN-NEXT:    s_add_i32 s8, s1, 1
-; GCN-NEXT:    s_sub_i32 s3, s2, s9
-; GCN-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-NEXT:    s_cselect_b32 s1, s8, s1
-; GCN-NEXT:    s_cselect_b32 s2, s3, s2
-; GCN-NEXT:    s_add_i32 s3, s1, 1
-; GCN-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-NEXT:    s_cselect_b32 s1, s3, s1
-; GCN-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-NEXT:    s_sub_i32 s0, s1, s0
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s3, s2, s9
+; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    s_add_i32 s8, s2, 1
+; GCN-NEXT:    s_sub_i32 s3, s0, s9
+; GCN-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-NEXT:    s_cselect_b32 s2, s8, s2
+; GCN-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-NEXT:    s_add_i32 s3, s2, 1
+; GCN-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_sub_i32 s0, s0, s1
 ; GCN-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -735,41 +724,38 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-IR-NEXT:    s_ashr_i32 s8, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s8
-; GCN-IR-NEXT:    s_xor_b32 s9, s2, s8
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 33
+; GCN-IR-NEXT:    s_abs_i32 s9, s8
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sub_i32 s2, 0, s9
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s3
+; GCN-IR-NEXT:    s_abs_i32 s0, s2
+; GCN-IR-NEXT:    s_xor_b32 s1, s2, s8
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-IR-NEXT:    s_xor_b32 s2, s2, s3
-; GCN-IR-NEXT:    s_xor_b32 s0, s3, s8
+; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 31
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-IR-NEXT:    s_mul_i32 s3, s1, s9
-; GCN-IR-NEXT:    s_sub_i32 s2, s2, s3
-; GCN-IR-NEXT:    s_add_i32 s8, s1, 1
-; GCN-IR-NEXT:    s_sub_i32 s3, s2, s9
-; GCN-IR-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-IR-NEXT:    s_cselect_b32 s1, s8, s1
-; GCN-IR-NEXT:    s_cselect_b32 s2, s3, s2
-; GCN-IR-NEXT:    s_add_i32 s3, s1, 1
-; GCN-IR-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-IR-NEXT:    s_cselect_b32 s1, s3, s1
-; GCN-IR-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-IR-NEXT:    s_sub_i32 s0, s1, s0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-IR-NEXT:    s_mul_i32 s3, s2, s9
+; GCN-IR-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-IR-NEXT:    s_add_i32 s8, s2, 1
+; GCN-IR-NEXT:    s_sub_i32 s3, s0, s9
+; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-IR-NEXT:    s_cselect_b32 s2, s8, s2
+; GCN-IR-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-IR-NEXT:    s_add_i32 s3, s2, 1
+; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-IR-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-IR-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-IR-NEXT:    s_sub_i32 s0, s0, s1
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -856,41 +842,38 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-NEXT:    s_ashr_i32 s8, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s8
-; GCN-NEXT:    s_xor_b32 s9, s2, s8
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[2:3], 39
+; GCN-NEXT:    s_abs_i32 s9, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s2, 0, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
+; GCN-NEXT:    s_abs_i32 s0, s2
+; GCN-NEXT:    s_xor_b32 s1, s2, s8
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-NEXT:    s_xor_b32 s2, s2, s3
-; GCN-NEXT:    s_xor_b32 s0, s3, s8
+; GCN-NEXT:    s_ashr_i32 s1, s1, 31
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_mul_i32 s3, s1, s9
-; GCN-NEXT:    s_sub_i32 s2, s2, s3
-; GCN-NEXT:    s_add_i32 s8, s1, 1
-; GCN-NEXT:    s_sub_i32 s3, s2, s9
-; GCN-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-NEXT:    s_cselect_b32 s1, s8, s1
-; GCN-NEXT:    s_cselect_b32 s2, s3, s2
-; GCN-NEXT:    s_add_i32 s3, s1, 1
-; GCN-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-NEXT:    s_cselect_b32 s1, s3, s1
-; GCN-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-NEXT:    s_sub_i32 s0, s1, s0
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s3, s2, s9
+; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    s_add_i32 s8, s2, 1
+; GCN-NEXT:    s_sub_i32 s3, s0, s9
+; GCN-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-NEXT:    s_cselect_b32 s2, s8, s2
+; GCN-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-NEXT:    s_add_i32 s3, s2, 1
+; GCN-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_sub_i32 s0, s0, s1
 ; GCN-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -903,41 +886,38 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-IR-NEXT:    s_ashr_i32 s8, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s8
-; GCN-IR-NEXT:    s_xor_b32 s9, s2, s8
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 39
+; GCN-IR-NEXT:    s_abs_i32 s9, s8
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sub_i32 s2, 0, s9
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s3
+; GCN-IR-NEXT:    s_abs_i32 s0, s2
+; GCN-IR-NEXT:    s_xor_b32 s1, s2, s8
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-IR-NEXT:    s_xor_b32 s2, s2, s3
-; GCN-IR-NEXT:    s_xor_b32 s0, s3, s8
+; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 31
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-IR-NEXT:    s_mul_i32 s3, s1, s9
-; GCN-IR-NEXT:    s_sub_i32 s2, s2, s3
-; GCN-IR-NEXT:    s_add_i32 s8, s1, 1
-; GCN-IR-NEXT:    s_sub_i32 s3, s2, s9
-; GCN-IR-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-IR-NEXT:    s_cselect_b32 s1, s8, s1
-; GCN-IR-NEXT:    s_cselect_b32 s2, s3, s2
-; GCN-IR-NEXT:    s_add_i32 s3, s1, 1
-; GCN-IR-NEXT:    s_cmp_ge_u32 s2, s9
-; GCN-IR-NEXT:    s_cselect_b32 s1, s3, s1
-; GCN-IR-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-IR-NEXT:    s_sub_i32 s0, s1, s0
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-IR-NEXT:    s_mul_i32 s3, s2, s9
+; GCN-IR-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-IR-NEXT:    s_add_i32 s8, s2, 1
+; GCN-IR-NEXT:    s_sub_i32 s3, s0, s9
+; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-IR-NEXT:    s_cselect_b32 s2, s8, s2
+; GCN-IR-NEXT:    s_cselect_b32 s0, s3, s0
+; GCN-IR-NEXT:    s_add_i32 s3, s2, 1
+; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s9
+; GCN-IR-NEXT:    s_cselect_b32 s0, s3, s2
+; GCN-IR-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-IR-NEXT:    s_sub_i32 s0, s0, s1
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -1100,11 +1080,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
 ; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 16
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s5, 31
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[6:7]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
 ; GCN-IR-NEXT:    s_mov_b32 s5, s4
 ; GCN-IR-NEXT:    s_sub_u32 s12, s6, s2
 ; GCN-IR-NEXT:    s_subb_u32 s13, s7, s2
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], s[8:9]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[4:5]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s4
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
@@ -1310,7 +1290,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s3, 31
 ; GCN-IR-NEXT:    s_mov_b32 s5, s4
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-IR-NEXT:    s_sub_u32 s2, s2, s4
 ; GCN-IR-NEXT:    s_subb_u32 s3, s3, s4
 ; GCN-IR-NEXT:    s_flbit_i32_b64 s14, s[2:3]
@@ -1493,8 +1473,8 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v12, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v12, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v12
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v12
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v12, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
@@ -1686,8 +1666,8 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v12, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v12, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v12
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v12
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v12, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
@@ -1788,8 +1768,8 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v10, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, v10, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v10
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v0, v10
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, v1, v10, vcc
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ed7f27b367fda..93fab7dff253b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -335,11 +335,11 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v14
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v14
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
-; GCN-IR-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v14, vcc
+; GCN-IR-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GCN-IR-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GCN-IR-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
@@ -655,37 +655,34 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
-; GCN-NEXT:    s_xor_b32 s8, s2, s3
+; GCN-NEXT:    s_abs_i32 s8, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
+; GCN-NEXT:    s_abs_i32 s3, s2
+; GCN-NEXT:    s_ashr_i32 s0, s2, 31
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-NEXT:    s_xor_b32 s2, s2, s3
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s0, s0, s8
-; GCN-NEXT:    s_sub_i32 s0, s2, s0
-; GCN-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_xor_b32 s0, s0, s3
-; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-NEXT:    s_mul_i32 s1, s1, s8
+; GCN-NEXT:    s_sub_i32 s1, s3, s1
+; GCN-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-NEXT:    s_sub_i32 s0, s1, s0
 ; GCN-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -699,37 +696,34 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s3
-; GCN-IR-NEXT:    s_xor_b32 s8, s2, s3
+; GCN-IR-NEXT:    s_abs_i32 s8, s2
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s3
+; GCN-IR-NEXT:    s_abs_i32 s3, s2
+; GCN-IR-NEXT:    s_ashr_i32 s0, s2, 31
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-IR-NEXT:    s_xor_b32 s2, s2, s3
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-IR-NEXT:    s_mul_i32 s0, s0, s8
-; GCN-IR-NEXT:    s_sub_i32 s0, s2, s0
-; GCN-IR-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-IR-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-IR-NEXT:    s_xor_b32 s0, s0, s3
-; GCN-IR-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-IR-NEXT:    s_mul_i32 s1, s1, s8
+; GCN-IR-NEXT:    s_sub_i32 s1, s3, s1
+; GCN-IR-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-IR-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-IR-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-IR-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-IR-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-IR-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-IR-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-IR-NEXT:    s_sub_i32 s0, s1, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -750,37 +744,34 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
-; GCN-NEXT:    s_xor_b32 s8, s2, s3
+; GCN-NEXT:    s_abs_i32 s8, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
+; GCN-NEXT:    s_abs_i32 s3, s2
+; GCN-NEXT:    s_ashr_i32 s0, s2, 31
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-NEXT:    s_xor_b32 s2, s2, s3
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s0, s0, s8
-; GCN-NEXT:    s_sub_i32 s0, s2, s0
-; GCN-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_xor_b32 s0, s0, s3
-; GCN-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-NEXT:    s_mul_i32 s1, s1, s8
+; GCN-NEXT:    s_sub_i32 s1, s3, s1
+; GCN-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-NEXT:    s_sub_i32 s0, s1, s0
 ; GCN-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -794,37 +785,34 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s3
-; GCN-IR-NEXT:    s_xor_b32 s8, s2, s3
+; GCN-IR-NEXT:    s_abs_i32 s8, s2
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s3
+; GCN-IR-NEXT:    s_abs_i32 s3, s2
+; GCN-IR-NEXT:    s_ashr_i32 s0, s2, 31
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GCN-IR-NEXT:    s_xor_b32 s2, s2, s3
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-IR-NEXT:    s_mul_i32 s0, s0, s8
-; GCN-IR-NEXT:    s_sub_i32 s0, s2, s0
-; GCN-IR-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-IR-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-IR-NEXT:    s_xor_b32 s0, s0, s3
-; GCN-IR-NEXT:    s_sub_i32 s0, s0, s3
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-IR-NEXT:    s_mul_i32 s1, s1, s8
+; GCN-IR-NEXT:    s_sub_i32 s1, s3, s1
+; GCN-IR-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-IR-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-IR-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-IR-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-IR-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-IR-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-IR-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-IR-NEXT:    s_sub_i32 s0, s1, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -845,36 +833,33 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
-; GCN-NEXT:    s_xor_b32 s8, s2, s3
+; GCN-NEXT:    s_abs_i32 s8, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
-; GCN-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-NEXT:    s_add_i32 s3, s3, s2
-; GCN-NEXT:    s_xor_b32 s3, s3, s2
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_abs_i32 s2, s3
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s0, s0, s8
-; GCN-NEXT:    s_sub_i32 s0, s3, s0
-; GCN-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-NEXT:    s_xor_b32 s0, s0, s2
-; GCN-NEXT:    s_sub_i32 s0, s0, s2
+; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_ashr_i32 s0, s3, 31
+; GCN-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-NEXT:    s_mul_i32 s1, s1, s8
+; GCN-NEXT:    s_sub_i32 s1, s2, s1
+; GCN-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-NEXT:    s_sub_i32 s0, s1, s0
 ; GCN-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -887,36 +872,33 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-IR-NEXT:    s_add_i32 s2, s2, s3
-; GCN-IR-NEXT:    s_xor_b32 s8, s2, s3
+; GCN-IR-NEXT:    s_abs_i32 s8, s2
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v0
-; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT:    s_add_i32 s3, s3, s2
-; GCN-IR-NEXT:    s_xor_b32 s3, s3, s2
+; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_abs_i32 s2, s3
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GCN-IR-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-IR-NEXT:    s_mul_i32 s0, s0, s8
-; GCN-IR-NEXT:    s_sub_i32 s0, s3, s0
-; GCN-IR-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-IR-NEXT:    s_sub_i32 s1, s0, s8
-; GCN-IR-NEXT:    s_cmp_ge_u32 s0, s8
-; GCN-IR-NEXT:    s_cselect_b32 s0, s1, s0
-; GCN-IR-NEXT:    s_xor_b32 s0, s0, s2
-; GCN-IR-NEXT:    s_sub_i32 s0, s0, s2
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_ashr_i32 s0, s3, 31
+; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v0
+; GCN-IR-NEXT:    s_mul_i32 s1, s1, s8
+; GCN-IR-NEXT:    s_sub_i32 s1, s2, s1
+; GCN-IR-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-IR-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-IR-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-IR-NEXT:    s_sub_i32 s2, s1, s8
+; GCN-IR-NEXT:    s_cmp_ge_u32 s1, s8
+; GCN-IR-NEXT:    s_cselect_b32 s1, s2, s1
+; GCN-IR-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-IR-NEXT:    s_sub_i32 s0, s1, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -1074,19 +1056,19 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[0:1], 31
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_mov_b32 s11, s10
 ; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[10:11]
 ; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
 ; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
+; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[10:11]
 ; GCN-IR-NEXT:    s_sub_u32 s8, s6, s10
 ; GCN-IR-NEXT:    s_subb_u32 s9, s7, s10
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[2:3], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b64 s12, s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[14:15]
 ; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[2:3]
 ; GCN-IR-NEXT:    s_sub_u32 s14, s12, s20
 ; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
@@ -1215,17 +1197,17 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[4:5], 24
 ; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 24
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 16
-; GCN-IR-NEXT:    s_lshl_b64 s[4:5], s[4:5], 16
-; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[2:3], 16
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[4:5], 16
+; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[2:3], 16
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT:    s_ashr_i32 s10, s5, 31
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 16
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_mov_b32 s11, s10
-; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[6:7], s[2:3]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 16
+; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
 ; GCN-IR-NEXT:    s_sub_u32 s4, s4, s2
 ; GCN-IR-NEXT:    s_subb_u32 s5, s5, s2
+; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s10
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 33cc8e96f663f..1c303de55c95d 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -177,56 +177,55 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $152, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subl $156, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    subl %ebx, %ebp
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    xorl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    xorl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
@@ -238,76 +237,78 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    bsrl %edi, %edx
+; X86-NEXT:    bsrl %ebp, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bsrl %ebp, %ebp
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:    addl $32, %ebp
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnel %edx, %ebp
-; X86-NEXT:    addl $64, %ebp
+; X86-NEXT:    bsrl %edi, %edi
+; X86-NEXT:    xorl $31, %edi
+; X86-NEXT:    addl $32, %edi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    cmovnel %edx, %edi
+; X86-NEXT:    addl $64, %edi
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %ebx
-; X86-NEXT:    cmovnel %ecx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    cmovnel %ecx, %edi
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    bsrl %ebp, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    addl $32, %ecx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    bsrl %edi, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %esi, %esi
-; X86-NEXT:    subl %edx, %ebp
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
 ; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebp, %ecx
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %edi, %ecx
 ; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
-; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    cmovnel %ebx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %esi, %eax
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    jne .LBB4_8
 ; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    xorl $127, %edi
-; X86-NEXT:    orl %ebp, %edi
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    je .LBB4_8
@@ -316,10 +317,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -333,27 +334,27 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %edi
-; X86-NEXT:    movl 144(%esp,%edi), %edx
-; X86-NEXT:    movl 148(%esp,%edi), %esi
+; X86-NEXT:    movl 148(%esp,%edi), %edx
+; X86-NEXT:    movl 152(%esp,%edi), %esi
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 140(%esp,%edi), %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    shrl %ebx
-; X86-NEXT:    shrl %cl, %ebx
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    movl 136(%esp,%edi), %edx
+; X86-NEXT:    movl 144(%esp,%edi), %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    shrl %ebp
+; X86-NEXT:    shrl %cl, %ebp
+; X86-NEXT:    orl %edx, %ebp
+; X86-NEXT:    movl 140(%esp,%edi), %edx
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %eax
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    jae .LBB4_3
@@ -363,38 +364,37 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_3: # %udiv-preheader
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %dl, %ch
 ; X86-NEXT:    andb $7, %ch
 ; X86-NEXT:    movb %dl, %cl
 ; X86-NEXT:    shrb $3, %cl
 ; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movzbl %cl, %edx
-; X86-NEXT:    movl 100(%esp,%edx), %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%edx), %edi
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 104(%esp,%edx), %ebx
+; X86-NEXT:    movl 100(%esp,%edx), %edi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %esi, %ebp
-; X86-NEXT:    movl 88(%esp,%edx), %ebx
+; X86-NEXT:    shrdl %cl, %ebx, %ebp
 ; X86-NEXT:    movl 92(%esp,%edx), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%edx), %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    notb %cl
@@ -403,9 +403,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    orl %edx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NEXT:    shrl %cl, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -415,7 +415,6 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -424,26 +423,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebp, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebp, %esi
+; X86-NEXT:    shldl $1, %ebp, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -452,15 +451,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
@@ -468,21 +467,22 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %eax, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    sbbl %edi, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
-; X86-NEXT:    adcl $-1, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %esi
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -491,15 +491,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    orl %edi, %ecx
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    shldl $1, %ebp, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    shldl $1, %eax, %ebx
-; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    shldl $1, %eax, %ebp
+; X86-NEXT:    orl %ecx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %eax
 ; X86-NEXT:    orl %ecx, %eax
@@ -508,36 +508,35 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:  .LBB4_8: # %udiv-end
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    xorl %ecx, %ebx
+; X86-NEXT:    xorl %ecx, %ebp
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    subl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    sbbl %ecx, %ebp
 ; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %ebx, 8(%ecx)
+; X86-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %ebp, %edi
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edx
@@ -545,7 +544,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
@@ -556,35 +555,36 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ebp, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    imull %esi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    imull %edx, %ebp
+; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    addl %ecx, %ebp
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    subl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    subl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %ebp, %edi
+; X86-NEXT:    sbbl %esi, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    addl $152, %esp
+; X86-NEXT:    addl $156, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index ace78b38d53ed..fbc363f77ec42 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -36,9 +36,9 @@ define void @f() nounwind {
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sarl $30, %ecx
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    shrdl $1, %eax, %ecx
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    shrdl $1, %eax, %ecx
 ; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    subl %ecx, %esi
 ; X86-NEXT:    sbbl %eax, %edi



More information about the llvm-commits mailing list