[llvm] fbdea5a - [AMDGPU] Always select s_cselect_b32 for uniform 'select' SDNode

Alexander Timofeev via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 15 13:04:16 PDT 2022


Author: Alexander Timofeev
Date: 2022-09-15T22:03:56+02:00
New Revision: fbdea5a2e9169be524463aae6663d958b94a2685

URL: https://github.com/llvm/llvm-project/commit/fbdea5a2e9169be524463aae6663d958b94a2685
DIFF: https://github.com/llvm/llvm-project/commit/fbdea5a2e9169be524463aae6663d958b94a2685.diff

LOG: [AMDGPU] Always select s_cselect_b32 for uniform 'select' SDNode

This patch contains changes necessary to carry physical condition register (SCC) dependencies through the SDNode scheduler.  It adds the edge in the SDNodeScheduler dependency graph instead of inserting the SCC copy between each definition and use. This approach lets the scheduler place instructions in an optimal way, placing the copy only when the dependency cannot be resolved.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D133593

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/MachineRegisterInfo.cpp
    llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
    llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
    llvm/test/CodeGen/AMDGPU/fceil64.ll
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
    llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
    llvm/test/CodeGen/AMDGPU/mad_uint24.ll
    llvm/test/CodeGen/AMDGPU/sad.ll
    llvm/test/CodeGen/AMDGPU/sdiv.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
    llvm/test/CodeGen/AMDGPU/select-opt.ll
    llvm/test/CodeGen/AMDGPU/select-vectors.ll
    llvm/test/CodeGen/AMDGPU/select64.ll
    llvm/test/CodeGen/AMDGPU/selectcc.ll
    llvm/test/CodeGen/AMDGPU/setcc64.ll
    llvm/test/CodeGen/AMDGPU/shift-i128.ll
    llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/trunc.ll
    llvm/test/CodeGen/AMDGPU/udiv.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/udivrem.ll
    llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/vselect.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 7f6580d05e516..3daa9b5b4c555 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4023,6 +4023,23 @@ class TargetLowering : public TargetLoweringBase {
     return false;
   }
 
+  /// Allows the target to handle physreg-carried dependency
+  /// in target-specific way. Used from the ScheduleDAGSDNodes to decide whether
+  /// to add the edge to the dependency graph.
+  /// Def - input: Selection DAG node defining physical register
+  /// User - input: Selection DAG node using physical register
+  /// Op - input: Number of User operand
+  /// PhysReg - inout: set to the physical register if the edge is
+  /// necessary, unchanged otherwise
+  /// Cost - inout: physical register copy cost.
+  /// Returns 'true' if the edge is necessary, 'false' otherwise
+  virtual bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
+                                         const TargetRegisterInfo *TRI,
+                                         const TargetInstrInfo *TII,
+                                         unsigned &PhysReg, int &Cost) const {
+    return false;
+  }
+
   /// Target-specific combining of register parts into its original value
   virtual SDValue
   joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,

diff  --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index c96de7ab5c597..e48f1beaae2be 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -79,10 +79,10 @@ constrainRegClass(MachineRegisterInfo &MRI, Register Reg,
   return NewRC;
 }
 
-const TargetRegisterClass *
-MachineRegisterInfo::constrainRegClass(Register Reg,
-                                       const TargetRegisterClass *RC,
-                                       unsigned MinNumRegs) {
+const TargetRegisterClass *MachineRegisterInfo::constrainRegClass(
+    Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs) {
+  if (Reg.isPhysical())
+    return nullptr;
   return ::constrainRegClass(*this, Reg, getRegClass(Reg), RC, MinNumRegs);
 }
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index bb0f29bbaad4f..ba986159e86c6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -110,11 +110,15 @@ SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
 static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
                                       const TargetRegisterInfo *TRI,
                                       const TargetInstrInfo *TII,
+                                      const TargetLowering &TLI,
                                       unsigned &PhysReg, int &Cost) {
   if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
     return;
 
   unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+  if (TLI.checkForPhysRegDependency(Def, User, Op, TRI, TII, PhysReg, Cost))
+    return;
+
   if (Register::isVirtualRegister(Reg))
     return;
 
@@ -485,7 +489,8 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
         unsigned PhysReg = 0;
         int Cost = 1;
         // Determine if this is a physical register dependency.
-        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
+        const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+        CheckForPhysRegDependency(OpN, N, i, TRI, TII, TLI, PhysReg, Cost);
         assert((PhysReg == 0 || !isChain) &&
                "Chain dependence via physreg data?");
         // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 23278a954d086..89132d232a570 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12988,3 +12988,28 @@ SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
     return MONoClobber;
   return MachineMemOperand::MONone;
 }
+
+bool SITargetLowering::checkForPhysRegDependency(
+    SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
+    const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
+  if (User->getOpcode() != ISD::CopyToReg)
+    return false;
+  if (!Def->isMachineOpcode())
+    return false;
+  MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
+  if (!MDef)
+    return false;
+
+  unsigned ResNo = User->getOperand(Op).getResNo();
+  if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
+    return false;
+  const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
+  if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
+    PhysReg = AMDGPU::SCC;
+    const TargetRegisterClass *RC =
+        TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
+    Cost = RC->getCopyCost();
+    return true;
+  }
+  return false;
+}

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 81bd8dedc303d..9e8ff565fe6bd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -479,6 +479,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
   bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
 
+  bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
+                                 const TargetRegisterInfo *TRI,
+                                 const TargetInstrInfo *TII, unsigned &PhysReg,
+                                 int &Cost) const override;
+
   bool isKnownNeverNaNForTargetNode(SDValue Op,
                                     const SelectionDAG &DAG,
                                     bool SNaN = false,

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 6b52f8e82dbf8..15edf7c07dd1e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -931,6 +931,8 @@ const TargetRegisterClass *
 SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
   if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
     return getEquivalentVGPRClass(RC);
+  if (RC == &AMDGPU::SCC_CLASSRegClass)
+    return getWaveMaskRegClass();
 
   return RC;
 }

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 37d20045adb52..5e6f160e01454 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -523,8 +523,7 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32",
 class SelectPat<SDPatternOperator select> : PatFrag <
   (ops node:$src1, node:$src2),
   (select SCC, $src1, $src2),
-  [{ return Subtarget->hasScalarCompareEq64() &&
-            N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
+  [{ return !N->isDivergent(); }]
 >;
 
 let Uses = [SCC] in {

diff  --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 90ca343a65096..32463af9f4377 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -58,7 +58,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
 ; GFX7 v_cmp_ne_u32
-; GFX7: v_cndmask_b32
+; GFX7: s_cselect_b32
 ; GFX8: s_cmp_lg_u32
 ; GFX8-NOT: v_cmp_ne_u32
 ; GFX8: s_cselect_b32

diff  --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 5debd68b30c0f..f5760793828b3 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -9,33 +9,26 @@
 
 ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
-; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
 ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
-; CI-DAG: s_cselect_b64 vcc, -1, 0
-; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
-; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
 
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
 ; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
-; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
 
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
 ; GFX9: s_cmp_lg_u32 [[PTR]], -1
-; GFX9: s_cselect_b64 vcc, -1, 0
-; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
-; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_SHARED_BASE]], 0
+; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
 
 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 
 ; At most 2 digits. Make sure src_shared_base is not counted as a high
 ; number SGPR.
 
-; CI: NumSgprs: {{[0-9][0-9]+}}
-; GFX9: NumSgprs: {{[0-9]+}}
+; HSA: NumSgprs: {{[0-9]+}}
 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
   store volatile i32 7, i32* %stof
@@ -75,33 +68,26 @@ define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
 
 ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
-; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
 
 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
-; CI-DAG: s_cselect_b64 vcc, -1, 0
-; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
-; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
 
 ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
 ; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
 ; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
-; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]
 
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
 
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; GFX9: s_cmp_lg_u32 [[PTR]], -1
-; GFX9: s_cselect_b64 vcc, -1, 0
-; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
-; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
+; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]], 0
+; GFX9: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
 
 ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 
-; CI: NumSgprs: {{[0-9][0-9]+}}
-; GFX9: NumSgprs: {{[0-9]+}}
+; HSA: NumSgprs: {{[0-9]+}}
 define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
   store volatile i32 7, i32* %stof
@@ -155,14 +141,16 @@ define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)
 ; HSA: enable_sgpr_queue_ptr = 0
 
 ; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]]
-; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
-; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
-; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; CI-DAG: v_cmp_ne_u64_e64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
+; CI-DAG: s_and_b64 s{{[[0-9]+:[0-9]+]}}, s[[[CMP_LO]]:[[CMP_HI]]], exec
+; CI-DAG: s_cselect_b32 [[CASTPTR:s[0-9]+]], s[[PTR_LO]], -1
+; CI-DAG: v_mov_b32_e32 [[VCASTPTR:v[0-9]+]], [[CASTPTR]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0
 ; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
 ; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
-; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
+; CI-DAG: ds_write_b32 [[VCASTPTR]], v[[K]]
+; GFX9-DAG: ds_write_b32 [[CASTPTR]], v[[K]]
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
   %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
@@ -175,14 +163,19 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
 ; HSA: enable_sgpr_queue_ptr = 0
 
 ; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]]
-; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
-; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
-; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; CI-DAG v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
+; CI-DAG v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; CI-DAG v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; CI-DAG: v_cmp_ne_u64_e64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
+; CI-DAG: s_and_b64 s{{[[0-9]+:[0-9]+]}}, s[[[CMP_LO]]:[[CMP_HI]]], exec
+; CI-DAG: s_cselect_b32 [[CASTPTR:s[0-9]+]], s[[PTR_LO]], -1
+; CI-DAG: v_mov_b32_e32 [[VCASTPTR:v[0-9]+]], [[CASTPTR]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0
 ; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
 ; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
-; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
+; CI: buffer_store_dword v[[K]], [[VCASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
+; GFX9: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
   %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
   store volatile i32 0, i32 addrspace(5)* %ftos

diff  --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 7f2bdeec56262..03bcd58786139 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -521,266 +521,258 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 {
 define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 {
 ; GFX908-LABEL: introduced_copy_to_sgpr:
 ; GFX908:       ; %bb.0: ; %bb
-; GFX908-NEXT:    global_load_ushort v24, v[0:1], off glc
+; GFX908-NEXT:    global_load_ushort v16, v[0:1], off glc
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX908-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x10
-; GFX908-NEXT:    v_mov_b32_e32 v1, 0
-; GFX908-NEXT:    s_load_dword s5, s[4:5], 0x18
-; GFX908-NEXT:    s_mov_b32 s4, 0
+; GFX908-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX908-NEXT:    s_load_dword s9, s[4:5], 0x18
+; GFX908-NEXT:    s_mov_b32 s8, 0
+; GFX908-NEXT:    s_mov_b32 s5, s8
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX908-NEXT:    s_sub_i32 s6, 0, s3
-; GFX908-NEXT:    s_lshl_b64 s[8:9], s[10:11], 5
-; GFX908-NEXT:    s_lshr_b32 s12, s5, 16
-; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v25, s5
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v26, s12
-; GFX908-NEXT:    s_or_b32 s8, s8, 28
-; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX908-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX908-NEXT:    v_mov_b32_e32 v6, s10
-; GFX908-NEXT:    v_mov_b32_e32 v7, s11
-; GFX908-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX908-NEXT:    s_lshl_b64 s[6:7], s[0:1], 5
-; GFX908-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX908-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX908-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX908-NEXT:    v_mov_b32_e32 v2, s8
-; GFX908-NEXT:    v_mov_b32_e32 v3, s9
-; GFX908-NEXT:    v_mul_lo_u32 v4, v0, s3
-; GFX908-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX908-NEXT:    v_sub_u32_e32 v4, s2, v4
-; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
-; GFX908-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX908-NEXT:    v_subrev_u32_e32 v5, s3, v4
-; GFX908-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX908-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
-; GFX908-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX908-NEXT:    v_lshlrev_b64 v[4:5], 5, v[0:1]
+; GFX908-NEXT:    s_sub_i32 s4, 0, s3
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s9
+; GFX908-NEXT:    v_mov_b32_e32 v19, 0
+; GFX908-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX908-NEXT:    s_mul_i32 s4, s4, s10
+; GFX908-NEXT:    s_mul_hi_u32 s4, s10, s4
+; GFX908-NEXT:    s_add_i32 s10, s10, s4
+; GFX908-NEXT:    s_mul_hi_u32 s4, s2, s10
+; GFX908-NEXT:    s_mul_i32 s10, s4, s3
+; GFX908-NEXT:    s_sub_i32 s2, s2, s10
+; GFX908-NEXT:    s_add_i32 s11, s4, 1
+; GFX908-NEXT:    s_sub_i32 s10, s2, s3
+; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX908-NEXT:    s_cselect_b32 s4, s11, s4
+; GFX908-NEXT:    s_cselect_b32 s2, s10, s2
+; GFX908-NEXT:    s_add_i32 s10, s4, 1
+; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX908-NEXT:    s_cselect_b32 s4, s10, s4
+; GFX908-NEXT:    s_lshr_b32 s9, s9, 16
+; GFX908-NEXT:    s_lshl_b64 s[12:13], s[4:5], 5
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s9
+; GFX908-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
+; GFX908-NEXT:    s_lshl_b64 s[10:11], s[6:7], 5
+; GFX908-NEXT:    s_or_b32 s10, s10, 28
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s2, v24
-; GFX908-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX908-NEXT:    s_mul_i32 s1, s1, s2
-; GFX908-NEXT:    s_mul_hi_u32 s3, s0, s2
-; GFX908-NEXT:    s_mul_i32 s0, s0, s2
-; GFX908-NEXT:    s_add_i32 s1, s3, s1
-; GFX908-NEXT:    s_lshl_b64 s[8:9], s[0:1], 5
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v16
+; GFX908-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX908-NEXT:    s_mul_i32 s1, s1, s5
+; GFX908-NEXT:    s_mul_hi_u32 s9, s0, s5
+; GFX908-NEXT:    s_mul_i32 s0, s0, s5
+; GFX908-NEXT:    s_add_i32 s1, s9, s1
+; GFX908-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
 ; GFX908-NEXT:  .LBB3_1: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v0
-; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX908-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX908-NEXT:    s_add_u32 s6, s6, s4
+; GFX908-NEXT:    s_addc_u32 s7, s7, 0
+; GFX908-NEXT:    s_add_u32 s10, s10, s12
+; GFX908-NEXT:    s_addc_u32 s11, s11, s13
 ; GFX908-NEXT:  .LBB3_2: ; %bb9
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB3_5 Depth 2
 ; GFX908-NEXT:    s_cbranch_scc0 .LBB3_1
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v8, 0
-; GFX908-NEXT:    v_mov_b32_e32 v9, 0
-; GFX908-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
-; GFX908-NEXT:    s_mov_b32 s5, s4
-; GFX908-NEXT:    v_mov_b32_e32 v13, s5
-; GFX908-NEXT:    v_mov_b32_e32 v15, s5
-; GFX908-NEXT:    v_mov_b32_e32 v17, s5
-; GFX908-NEXT:    v_mov_b32_e32 v12, s4
-; GFX908-NEXT:    v_mov_b32_e32 v14, s4
-; GFX908-NEXT:    v_mov_b32_e32 v16, s4
-; GFX908-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v11, v3
-; GFX908-NEXT:    v_mov_b32_e32 v19, v13
-; GFX908-NEXT:    v_mov_b32_e32 v10, v2
-; GFX908-NEXT:    v_mov_b32_e32 v18, v12
+; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX908-NEXT:    s_mov_b32 s9, s8
+; GFX908-NEXT:    v_mov_b32_e32 v4, s8
+; GFX908-NEXT:    v_mov_b32_e32 v6, s8
+; GFX908-NEXT:    v_mov_b32_e32 v8, s8
+; GFX908-NEXT:    v_mov_b32_e32 v5, s9
+; GFX908-NEXT:    v_mov_b32_e32 v7, s9
+; GFX908-NEXT:    v_mov_b32_e32 v9, s9
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
+; GFX908-NEXT:    v_mov_b32_e32 v11, v5
+; GFX908-NEXT:    s_mov_b64 s[16:17], s[10:11]
+; GFX908-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX908-NEXT:    v_readfirstlane_b32 s3, v9
-; GFX908-NEXT:    s_add_u32 s2, s2, 1
-; GFX908-NEXT:    s_addc_u32 s3, s3, 0
-; GFX908-NEXT:    s_mul_hi_u32 s5, s6, s2
-; GFX908-NEXT:    s_mul_i32 s11, s7, s2
-; GFX908-NEXT:    s_mul_i32 s10, s6, s2
-; GFX908-NEXT:    s_mul_i32 s2, s6, s3
-; GFX908-NEXT:    s_add_i32 s2, s5, s2
-; GFX908-NEXT:    s_add_i32 s5, s2, s11
+; GFX908-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX908-NEXT:    s_add_u32 s5, s5, 1
+; GFX908-NEXT:    s_addc_u32 s9, s9, 0
+; GFX908-NEXT:    s_mul_hi_u32 s19, s2, s5
+; GFX908-NEXT:    s_mul_i32 s20, s3, s5
+; GFX908-NEXT:    s_mul_i32 s18, s2, s5
+; GFX908-NEXT:    s_mul_i32 s5, s2, s9
+; GFX908-NEXT:    s_add_i32 s5, s19, s5
+; GFX908-NEXT:    s_add_i32 s5, s5, s20
 ; GFX908-NEXT:    s_branch .LBB3_5
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT:    v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX908-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX908-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[8:9]
-; GFX908-NEXT:    v_mov_b32_e32 v20, s9
-; GFX908-NEXT:    v_add_co_u32_e64 v10, s[2:3], s8, v10
-; GFX908-NEXT:    v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3]
+; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX908-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GFX908-NEXT:    s_add_u32 s16, s16, s0
+; GFX908-NEXT:    s_addc_u32 s17, s17, s1
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX908-NEXT:  .LBB3_5: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    v_mov_b32_e32 v21, s5
-; GFX908-NEXT:    v_add_co_u32_e32 v20, vcc, s10, v10
-; GFX908-NEXT:    v_addc_co_u32_e32 v21, vcc, v11, v21, vcc
-; GFX908-NEXT:    global_load_dword v28, v[20:21], off offset:-12 glc
+; GFX908-NEXT:    s_add_u32 s20, s16, s18
+; GFX908-NEXT:    s_addc_u32 s21, s17, s5
+; GFX908-NEXT:    global_load_dword v21, v19, s[20:21] offset:-12 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v27, v[20:21], off offset:-8 glc
+; GFX908-NEXT:    global_load_dword v20, v19, s[20:21] offset:-8 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v22, v[20:21], off offset:-4 glc
+; GFX908-NEXT:    global_load_dword v12, v19, s[20:21] offset:-4 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v20, v[20:21], off glc
+; GFX908-NEXT:    global_load_dword v12, v19, s[20:21] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    ds_read_b64 v[20:21], v1
-; GFX908-NEXT:    ds_read_b64 v[22:23], v0
-; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    ds_read_b64 v[12:13], v19
+; GFX908-NEXT:    ds_read_b64 v[14:15], v0
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    s_cbranch_vccnz .LBB3_4
 ; GFX908-NEXT:  ; %bb.6: ; %bb51
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT:    v_cvt_f32_f16_sdwa v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; GFX908-NEXT:    v_cvt_f32_f16_sdwa v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; GFX908-NEXT:    v_add_f32_e32 v31, v25, v20
-; GFX908-NEXT:    v_add_f32_e32 v32, v26, v21
-; GFX908-NEXT:    v_add_f32_e32 v33, 0, v20
-; GFX908-NEXT:    v_add_f32_e32 v34, 0, v21
-; GFX908-NEXT:    v_add_f32_e32 v23, v29, v23
-; GFX908-NEXT:    v_add_f32_e32 v22, v28, v22
-; GFX908-NEXT:    v_add_f32_e32 v21, v30, v21
-; GFX908-NEXT:    v_add_f32_e32 v20, v27, v20
-; GFX908-NEXT:    v_add_f32_e32 v13, v13, v32
-; GFX908-NEXT:    v_add_f32_e32 v12, v12, v31
-; GFX908-NEXT:    v_add_f32_e32 v15, v15, v34
-; GFX908-NEXT:    v_add_f32_e32 v14, v14, v33
-; GFX908-NEXT:    v_add_f32_e32 v16, v16, v22
-; GFX908-NEXT:    v_add_f32_e32 v17, v17, v23
-; GFX908-NEXT:    v_add_f32_e32 v18, v18, v20
-; GFX908-NEXT:    v_add_f32_e32 v19, v19, v21
+; GFX908-NEXT:    v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; GFX908-NEXT:    v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GFX908-NEXT:    v_add_f32_e32 v24, v17, v12
+; GFX908-NEXT:    v_add_f32_e32 v25, v18, v13
+; GFX908-NEXT:    v_add_f32_e32 v26, 0, v12
+; GFX908-NEXT:    v_add_f32_e32 v27, 0, v13
+; GFX908-NEXT:    v_add_f32_e32 v15, v22, v15
+; GFX908-NEXT:    v_add_f32_e32 v14, v21, v14
+; GFX908-NEXT:    v_add_f32_e32 v13, v23, v13
+; GFX908-NEXT:    v_add_f32_e32 v12, v20, v12
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v25
+; GFX908-NEXT:    v_add_f32_e32 v4, v4, v24
+; GFX908-NEXT:    v_add_f32_e32 v7, v7, v27
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v26
+; GFX908-NEXT:    v_add_f32_e32 v8, v8, v14
+; GFX908-NEXT:    v_add_f32_e32 v9, v9, v15
+; GFX908-NEXT:    v_add_f32_e32 v10, v10, v12
+; GFX908-NEXT:    v_add_f32_e32 v11, v11, v13
 ; GFX908-NEXT:    s_branch .LBB3_4
 ;
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
 ; GFX90A:       ; %bb.0: ; %bb
-; GFX90A-NEXT:    global_load_ushort v28, v[0:1], off glc
+; GFX90A-NEXT:    global_load_ushort v18, v[0:1], off glc
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x10
-; GFX90A-NEXT:    s_load_dword s7, s[4:5], 0x18
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_mov_b32 s6, 0
+; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX90A-NEXT:    s_load_dword s9, s[4:5], 0x18
+; GFX90A-NEXT:    s_mov_b32 s8, 0
+; GFX90A-NEXT:    s_mov_b32 s5, s8
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s12, 0, s3
-; GFX90A-NEXT:    s_lshr_b32 s13, s7, 16
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s7
+; GFX90A-NEXT:    s_sub_i32 s4, 0, s3
+; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s13
-; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[0:1], 5
-; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[8:9], 5
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v0
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, s9
+; GFX90A-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX90A-NEXT:    s_mul_i32 s4, s4, s10
+; GFX90A-NEXT:    s_mul_hi_u32 s4, s10, s4
+; GFX90A-NEXT:    s_add_i32 s10, s10, s4
+; GFX90A-NEXT:    s_mul_hi_u32 s4, s2, s10
+; GFX90A-NEXT:    s_mul_i32 s10, s4, s3
+; GFX90A-NEXT:    s_sub_i32 s2, s2, s10
+; GFX90A-NEXT:    s_add_i32 s11, s4, 1
+; GFX90A-NEXT:    s_sub_i32 s10, s2, s3
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX90A-NEXT:    s_cselect_b32 s4, s11, s4
+; GFX90A-NEXT:    s_cselect_b32 s2, s10, s2
+; GFX90A-NEXT:    s_add_i32 s10, s4, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX90A-NEXT:    s_cselect_b32 s4, s10, s4
+; GFX90A-NEXT:    s_lshr_b32 s9, s9, 16
+; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[4:5], 5
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, s9
+; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
+; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[6:7], 5
 ; GFX90A-NEXT:    s_or_b32 s10, s10, 28
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s12, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v0, s3
-; GFX90A-NEXT:    v_sub_u32_e32 v8, s2, v8
-; GFX90A-NEXT:    v_add_u32_e32 v9, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v8
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
-; GFX90A-NEXT:    v_subrev_u32_e32 v9, s3, v8
-; GFX90A-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v9, 1, v0
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s3, v8
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
-; GFX90A-NEXT:    v_lshlrev_b64 v[8:9], 5, v[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], 0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_readfirstlane_b32 s2, v28
-; GFX90A-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX90A-NEXT:    s_mul_i32 s1, s1, s2
-; GFX90A-NEXT:    s_mul_hi_u32 s3, s0, s2
-; GFX90A-NEXT:    s_mul_i32 s0, s0, s2
-; GFX90A-NEXT:    s_add_i32 s1, s3, s1
-; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v18
+; GFX90A-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX90A-NEXT:    s_mul_i32 s1, s1, s5
+; GFX90A-NEXT:    s_mul_hi_u32 s9, s0, s5
+; GFX90A-NEXT:    s_mul_i32 s0, s0, s5
+; GFX90A-NEXT:    s_add_i32 s1, s9, s1
+; GFX90A-NEXT:    s_lshl_b64 s[0:1], s[0:1], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
 ; GFX90A-NEXT:  .LBB3_1: ; %bb12
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v8
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v9, vcc
+; GFX90A-NEXT:    s_add_u32 s6, s6, s4
+; GFX90A-NEXT:    s_addc_u32 s7, s7, 0
+; GFX90A-NEXT:    s_add_u32 s10, s10, s12
+; GFX90A-NEXT:    s_addc_u32 s11, s11, s13
 ; GFX90A-NEXT:  .LBB3_2: ; %bb9
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB3_5 Depth 2
 ; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_1
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off
-; GFX90A-NEXT:    s_mov_b32 s7, s6
-; GFX90A-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[20:21], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1]
+; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; GFX90A-NEXT:    s_mov_b32 s9, s8
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
+; GFX90A-NEXT:    s_mov_b64 s[16:17], s[10:11]
+; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_readfirstlane_b32 s7, v12
-; GFX90A-NEXT:    v_readfirstlane_b32 s8, v13
-; GFX90A-NEXT:    s_add_u32 s7, s7, 1
-; GFX90A-NEXT:    s_addc_u32 s9, s8, 0
-; GFX90A-NEXT:    s_mul_hi_u32 s10, s4, s7
-; GFX90A-NEXT:    s_mul_i32 s11, s5, s7
-; GFX90A-NEXT:    s_mul_i32 s8, s4, s7
-; GFX90A-NEXT:    s_mul_i32 s7, s4, s9
-; GFX90A-NEXT:    s_add_i32 s7, s10, s7
-; GFX90A-NEXT:    s_add_i32 s7, s7, s11
+; GFX90A-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v5
+; GFX90A-NEXT:    s_add_u32 s5, s5, 1
+; GFX90A-NEXT:    s_addc_u32 s9, s9, 0
+; GFX90A-NEXT:    s_mul_hi_u32 s19, s2, s5
+; GFX90A-NEXT:    s_mul_i32 s20, s3, s5
+; GFX90A-NEXT:    s_mul_i32 s18, s2, s5
+; GFX90A-NEXT:    s_mul_i32 s5, s2, s9
+; GFX90A-NEXT:    s_add_i32 s5, s19, s5
+; GFX90A-NEXT:    s_add_i32 s5, s5, s20
 ; GFX90A-NEXT:    s_branch .LBB3_5
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT:    v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
-; GFX90A-NEXT:    v_mov_b32_e32 v24, s3
-; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, s2, v14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v24, vcc
-; GFX90A-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[12:13]
+; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    s_add_u32 s16, s16, s0
+; GFX90A-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT:    s_addc_u32 s17, s17, s1
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX90A-NEXT:  .LBB3_5: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    v_mov_b32_e32 v25, s7
-; GFX90A-NEXT:    v_add_co_u32_e32 v24, vcc, s8, v14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v25, vcc, v15, v25, vcc
-; GFX90A-NEXT:    global_load_dword v30, v[24:25], off offset:-12 glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v29, v[24:25], off offset:-8 glc
+; GFX90A-NEXT:    s_add_u32 s20, s16, s18
+; GFX90A-NEXT:    s_addc_u32 s21, s17, s5
+; GFX90A-NEXT:    global_load_dword v21, v19, s[20:21] offset:-12 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v26, v[24:25], off offset:-4 glc
+; GFX90A-NEXT:    global_load_dword v20, v19, s[20:21] offset:-8 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v26, v[24:25], off glc
+; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] offset:-4 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    ; kill: killed $vgpr24 killed $vgpr25
-; GFX90A-NEXT:    ds_read_b64 v[24:25], v1
+; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    ds_read_b64 v[26:27], v0
-; GFX90A-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX90A-NEXT:    ds_read_b64 v[14:15], v19
+; GFX90A-NEXT:    ds_read_b64 v[16:17], v0
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; GFX90A-NEXT:    ; kill: killed $sgpr20 killed $sgpr21
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_vccnz .LBB3_4
 ; GFX90A-NEXT:  ; %bb.6: ; %bb51
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v33, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v32, v29
-; GFX90A-NEXT:    v_pk_add_f32 v[34:35], v[2:3], v[24:25]
-; GFX90A-NEXT:    v_pk_add_f32 v[36:37], v[24:25], 0 op_sel_hi:[1,0]
-; GFX90A-NEXT:    v_pk_add_f32 v[26:27], v[30:31], v[26:27]
-; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[32:33], v[24:25]
-; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[16:17], v[34:35]
-; GFX90A-NEXT:    v_pk_add_f32 v[18:19], v[18:19], v[36:37]
-; GFX90A-NEXT:    v_pk_add_f32 v[20:21], v[20:21], v[26:27]
-; GFX90A-NEXT:    v_pk_add_f32 v[22:23], v[22:23], v[24:25]
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v22, v21
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[0:1], v[14:15]
+; GFX90A-NEXT:    v_pk_add_f32 v[26:27], v[14:15], 0 op_sel_hi:[1,0]
+; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[22:23], v[16:17]
+; GFX90A-NEXT:    v_pk_add_f32 v[14:15], v[20:21], v[14:15]
+; GFX90A-NEXT:    v_pk_add_f32 v[6:7], v[6:7], v[24:25]
+; GFX90A-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[10:11], v[10:11], v[16:17]
+; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_branch .LBB3_4
 bb:
   %i = load volatile i16, i16 addrspace(4)* undef, align 2

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index bce6ba45c3121..49f016407c108 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -54,44 +54,49 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s0, s0, s3
+; GFX6-NEXT:    s_sub_i32 s0, s2, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s5, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_add_i32 s5, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, i32 addrspace(1)* %out
@@ -145,16 +150,18 @@ define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s3, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s3, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s0, s0, s3
+; GFX6-NEXT:    s_sub_i32 s0, s2, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -248,63 +255,68 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    s_xor_b32 s2, s2, s9
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_xor_b32 s0, s9, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX6-NEXT:    s_mul_i32 s1, s1, s3
+; GFX6-NEXT:    s_sub_i32 s1, s2, s1
+; GFX6-NEXT:    s_sub_i32 s2, s1, s3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s3
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s4
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
 ; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
 ; GFX9-NEXT:    s_add_i32 s2, s2, s5
-; GFX9-NEXT:    s_xor_b32 s2, s2, s5
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    s_xor_b32 s4, s5, s4
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s6, s5, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s6
+; GFX9-NEXT:    s_add_i32 s7, s5, 1
+; GFX9-NEXT:    s_sub_i32 s6, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX9-NEXT:    s_add_i32 s6, s5, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s6, s5
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, i32 addrspace(1)* %out
@@ -372,16 +384,18 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX6-NEXT:    s_mul_i32 s7, s7, s4
+; GFX6-NEXT:    s_sub_i32 s6, s6, s7
+; GFX6-NEXT:    s_sub_i32 s7, s6, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX6-NEXT:    s_sub_i32 s7, s6, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX6-NEXT:    s_cselect_b32 s4, s7, s6
+; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -605,15 +619,15 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -679,24 +693,24 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s2, s4, 16
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX6-NEXT:    s_sext_i32_i16 s3, s4
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
-; GFX6-NEXT:    s_xor_b32 s3, s3, s2
+; GFX6-NEXT:    s_ashr_i32 s5, s4, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GFX6-NEXT:    s_sext_i32_i16 s2, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX6-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
-; GFX6-NEXT:    s_or_b32 s3, s3, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s3
+; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX6-NEXT:    s_or_b32 s6, s2, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX6-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -909,15 +923,15 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -985,22 +999,22 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_i32 s2, s4, 0x80008
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX6-NEXT:    s_sext_i32_i8 s5, s4
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s2, s5, s2
+; GFX6-NEXT:    s_sext_i32_i8 s3, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX6-NEXT:    s_xor_b32 s2, s3, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
-; GFX6-NEXT:    s_or_b32 s2, s2, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s2
+; GFX6-NEXT:    s_lshr_b32 s5, s4, 8
+; GFX6-NEXT:    s_or_b32 s6, s2, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX6-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -1174,88 +1188,100 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ;
 ; GFX6-LABEL: udiv_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s15, 0xf000
-; GFX6-NEXT:    s_mov_b32 s14, -1
+; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s19, 0xf000
+; GFX6-NEXT:    s_mov_b32 s18, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_i32 s2, 0, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s10
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX6-NEXT:    s_sub_i32 s2, 0, s12
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s13
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s14
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s11
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s15
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s9
-; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX6-NEXT:    s_sub_i32 s2, 0, s10
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s12
+; GFX6-NEXT:    s_sub_i32 s2, s8, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s12
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s12
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s12
+; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX6-NEXT:    s_sub_i32 s4, 0, s13
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s9
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s9, v1
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX6-NEXT:    s_mul_i32 s4, s4, s13
+; GFX6-NEXT:    s_sub_i32 s4, s9, s4
+; GFX6-NEXT:    s_sub_i32 s5, s4, s13
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s13
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
-; GFX6-NEXT:    s_sub_i32 s0, 0, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s10
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
-; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s13
+; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT:    s_sub_i32 s6, 0, s14
+; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[4:5]
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v3
+; GFX6-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v6
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX6-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX6-NEXT:    s_mul_i32 s6, s6, s14
+; GFX6-NEXT:    s_sub_i32 s6, s10, s6
+; GFX6-NEXT:    s_sub_i32 s7, s6, s14
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s14
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
+; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s14
+; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX6-NEXT:    s_sub_i32 s8, 0, s15
+; GFX6-NEXT:    v_mul_lo_u32 v7, s8, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v3, v6, s[6:7]
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, s11, v5
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v5
+; GFX6-NEXT:    s_mul_i32 s0, s0, s15
+; GFX6-NEXT:    s_sub_i32 s0, s11, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s15
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s15
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s15
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v4i32:
@@ -1267,79 +1293,87 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX9-NEXT:    s_sub_i32 s2, 0, s8
-; GFX9-NEXT:    s_sub_i32 s3, 0, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX9-NEXT:    s_sub_i32 s2, 0, s10
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s8
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GFX9-NEXT:    s_sub_i32 s2, 0, s11
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s9
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX9-NEXT:    v_add_u32_e32 v8, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v6
-; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v3, v6, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s10
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, v6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s10, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s11
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
-; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s2, s2, s3
+; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s2
+; GFX9-NEXT:    s_add_i32 s3, s3, s2
+; GFX9-NEXT:    s_mul_hi_u32 s2, s4, s3
+; GFX9-NEXT:    s_mul_i32 s3, s2, s8
+; GFX9-NEXT:    s_sub_i32 s3, s4, s3
+; GFX9-NEXT:    s_add_i32 s13, s2, 1
+; GFX9-NEXT:    s_sub_i32 s4, s3, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s8
+; GFX9-NEXT:    s_cselect_b32 s2, s13, s2
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    s_add_i32 s4, s2, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_sub_i32 s3, 0, s9
+; GFX9-NEXT:    s_mul_i32 s3, s3, s12
+; GFX9-NEXT:    s_mul_hi_u32 s3, s12, s3
+; GFX9-NEXT:    s_add_i32 s12, s12, s3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v2
+; GFX9-NEXT:    s_mul_hi_u32 s3, s5, s12
+; GFX9-NEXT:    s_mul_i32 s4, s3, s9
+; GFX9-NEXT:    s_sub_i32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s8, s3, 1
+; GFX9-NEXT:    s_sub_i32 s5, s4, s9
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s9
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s3, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s9
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s11
+; GFX9-NEXT:    s_sub_i32 s4, 0, s10
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_mul_hi_u32 s4, s6, s5
+; GFX9-NEXT:    s_mul_i32 s5, s4, s10
+; GFX9-NEXT:    s_sub_i32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_sub_i32 s8, s5, s10
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s10
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s10
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_sub_i32 s5, 0, s11
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s6
+; GFX9-NEXT:    s_mul_i32 s6, s5, s11
+; GFX9-NEXT:    s_sub_i32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s7, s5, 1
+; GFX9-NEXT:    s_sub_i32 s8, s6, s11
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s11
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX9-NEXT:    s_add_i32 s7, s5, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s11
+; GFX9-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i32> %x, %y
@@ -1476,77 +1510,85 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_i32 s12, 0, s8
-; GFX6-NEXT:    s_sub_i32 s13, 0, s9
+; GFX6-NEXT:    s_sub_i32 s2, 0, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    s_sub_i32 s4, 0, s10
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT:    s_sub_i32 s4, 0, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s8
+; GFX6-NEXT:    s_sub_i32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s4, s3, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s11
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s9
+; GFX6-NEXT:    s_sub_i32 s2, s5, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s9
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s9
+; GFX6-NEXT:    s_cselect_b32 s5, s3, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s10
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s10
+; GFX6-NEXT:    s_sub_i32 s2, s6, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s10
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s10
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s10
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s10
+; GFX6-NEXT:    s_cselect_b32 s6, s3, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s11
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX6-NEXT:    s_mul_i32 s4, s4, s11
+; GFX6-NEXT:    s_sub_i32 s4, s7, s4
+; GFX6-NEXT:    s_sub_i32 s5, s4, s11
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    s_sub_i32 s5, s4, s11
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -1810,124 +1852,136 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ;
 ; GFX6-LABEL: sdiv_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s15, 0xf000
-; GFX6-NEXT:    s_mov_b32 s14, -1
+; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s19, 0xf000
+; GFX6-NEXT:    s_mov_b32 s18, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX6-NEXT:    s_add_i32 s3, s8, s2
+; GFX6-NEXT:    s_ashr_i32 s2, s12, 31
+; GFX6-NEXT:    s_add_i32 s3, s12, s2
 ; GFX6-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_ashr_i32 s8, s9, 31
-; GFX6-NEXT:    s_add_i32 s0, s9, s8
-; GFX6-NEXT:    s_xor_b32 s9, s0, s8
+; GFX6-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_i32 s1, 0, s3
-; GFX6-NEXT:    s_ashr_i32 s0, s4, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    s_xor_b32 s2, s0, s2
-; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX6-NEXT:    s_add_i32 s1, s4, s0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s0, 0, s9
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    s_add_i32 s1, s5, s0
-; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    s_xor_b32 s2, s0, s8
-; GFX6-NEXT:    s_add_i32 s0, s10, s3
-; GFX6-NEXT:    s_xor_b32 s4, s0, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s4
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v5
-; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX6-NEXT:    s_ashr_i32 s0, s6, 31
-; GFX6-NEXT:    s_add_i32 s5, s11, s2
-; GFX6-NEXT:    s_add_i32 s1, s6, s0
-; GFX6-NEXT:    s_xor_b32 s5, s5, s2
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT:    s_ashr_i32 s4, s8, 31
+; GFX6-NEXT:    s_add_i32 s5, s8, s4
+; GFX6-NEXT:    s_xor_b32 s5, s5, s4
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_xor_b32 s8, s4, s2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s3
+; GFX6-NEXT:    s_sub_i32 s2, s5, s2
+; GFX6-NEXT:    s_sub_i32 s4, s2, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX6-NEXT:    s_ashr_i32 s4, s13, 31
+; GFX6-NEXT:    s_add_i32 s5, s13, s4
+; GFX6-NEXT:    s_xor_b32 s5, s5, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GFX6-NEXT:    s_sub_i32 s6, 0, s5
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[2:3]
+; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s6, v2
+; GFX6-NEXT:    s_ashr_i32 s6, s9, 31
+; GFX6-NEXT:    s_add_i32 s7, s9, s6
+; GFX6-NEXT:    s_xor_b32 s7, s7, s6
+; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    s_xor_b32 s9, s6, s4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s5
-; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
-; GFX6-NEXT:    s_xor_b32 s3, s0, s3
+; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX6-NEXT:    s_mul_i32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s4, s7, s4
+; GFX6-NEXT:    s_sub_i32 s6, s4, s5
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
+; GFX6-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s5
+; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT:    s_ashr_i32 s6, s14, 31
+; GFX6-NEXT:    s_add_i32 s7, s14, s6
+; GFX6-NEXT:    s_xor_b32 s7, s7, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s7
+; GFX6-NEXT:    s_sub_i32 s12, 0, s7
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX6-NEXT:    v_xor_b32_e32 v2, s9, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s5
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX6-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX6-NEXT:    s_add_i32 s1, s7, s0
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v4
+; GFX6-NEXT:    s_ashr_i32 s12, s10, 31
+; GFX6-NEXT:    s_add_i32 s10, s10, s12
+; GFX6-NEXT:    s_xor_b32 s10, s10, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
-; GFX6-NEXT:    s_xor_b32 s2, s0, s2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s5
+; GFX6-NEXT:    s_xor_b32 s12, s12, s6
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v4
+; GFX6-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX6-NEXT:    s_mul_i32 s6, s6, s7
+; GFX6-NEXT:    s_sub_i32 s6, s10, s6
+; GFX6-NEXT:    s_sub_i32 s10, s6, s7
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s7
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX6-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s7
+; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX6-NEXT:    s_ashr_i32 s10, s15, 31
+; GFX6-NEXT:    s_add_i32 s13, s15, s10
+; GFX6-NEXT:    s_xor_b32 s13, s13, s10
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s13
+; GFX6-NEXT:    s_sub_i32 s0, 0, s13
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX6-NEXT:    v_xor_b32_e32 v4, s12, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v3
+; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX6-NEXT:    s_add_i32 s1, s11, s0
+; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v2
+; GFX6-NEXT:    s_xor_b32 s0, s0, s10
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, s1, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s12, v4
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX6-NEXT:    s_mul_i32 s2, s2, s13
+; GFX6-NEXT:    s_sub_i32 s1, s1, s2
+; GFX6-NEXT:    s_sub_i32 s2, s1, s13
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s13
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    s_cselect_b32 s1, s2, s1
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
+; GFX6-NEXT:    s_cmp_ge_u32 s1, s13
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v3
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v4i32:
@@ -1940,114 +1994,122 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX9-NEXT:    s_add_i32 s3, s8, s2
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
-; GFX9-NEXT:    s_add_i32 s9, s9, s12
-; GFX9-NEXT:    s_xor_b32 s9, s9, s12
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT:    s_sub_i32 s14, 0, s3
 ; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_xor_b32 s2, s8, s2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s4, s4, s8
+; GFX9-NEXT:    s_sub_i32 s8, 0, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX9-NEXT:    s_mul_i32 s8, s8, s12
+; GFX9-NEXT:    s_mul_hi_u32 s8, s12, s8
+; GFX9-NEXT:    s_add_i32 s12, s12, s8
+; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s12
+; GFX9-NEXT:    s_mul_i32 s12, s8, s3
+; GFX9-NEXT:    s_sub_i32 s4, s4, s12
+; GFX9-NEXT:    s_add_i32 s13, s8, 1
+; GFX9-NEXT:    s_sub_i32 s12, s4, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX9-NEXT:    s_cselect_b32 s8, s13, s8
+; GFX9-NEXT:    s_cselect_b32 s4, s12, s4
+; GFX9-NEXT:    s_add_i32 s12, s8, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX9-NEXT:    s_cselect_b32 s3, s12, s8
+; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX9-NEXT:    s_add_i32 s8, s9, s4
+; GFX9-NEXT:    s_xor_b32 s8, s8, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s4, s9, s4
+; GFX9-NEXT:    s_sub_i32 s2, s3, s2
+; GFX9-NEXT:    s_xor_b32 s3, s5, s9
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s5, 0, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s9
+; GFX9-NEXT:    s_mul_hi_u32 s5, s9, s5
+; GFX9-NEXT:    s_add_i32 s9, s9, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s9
+; GFX9-NEXT:    s_mul_i32 s9, s5, s8
+; GFX9-NEXT:    s_sub_i32 s3, s3, s9
+; GFX9-NEXT:    s_add_i32 s12, s5, 1
+; GFX9-NEXT:    s_sub_i32 s9, s3, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s8
+; GFX9-NEXT:    s_cselect_b32 s5, s12, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s9, s3
+; GFX9-NEXT:    s_add_i32 s9, s5, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s8
+; GFX9-NEXT:    s_cselect_b32 s3, s9, s5
+; GFX9-NEXT:    s_ashr_i32 s5, s10, 31
+; GFX9-NEXT:    s_add_i32 s8, s10, s5
+; GFX9-NEXT:    s_xor_b32 s8, s8, s5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    s_ashr_i32 s9, s6, 31
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_add_i32 s6, s6, s9
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s5, s9, s5
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s4, s6, s9
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s6, 0, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s6, s6, s9
+; GFX9-NEXT:    s_mul_hi_u32 s6, s9, s6
+; GFX9-NEXT:    s_add_i32 s9, s9, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s9
+; GFX9-NEXT:    s_mul_i32 s9, s6, s8
+; GFX9-NEXT:    s_sub_i32 s4, s4, s9
+; GFX9-NEXT:    s_add_i32 s10, s6, 1
+; GFX9-NEXT:    s_sub_i32 s9, s4, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s8
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s6
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX9-NEXT:    s_add_i32 s9, s6, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s8
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX9-NEXT:    s_add_i32 s8, s11, s6
+; GFX9-NEXT:    s_xor_b32 s8, s8, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
+; GFX9-NEXT:    s_xor_b32 s4, s4, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s4, s4, s8
-; GFX9-NEXT:    s_xor_b32 s4, s4, s8
-; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v0
+; GFX9-NEXT:    s_add_i32 s7, s7, s2
+; GFX9-NEXT:    s_xor_b32 s6, s2, s6
+; GFX9-NEXT:    s_sub_i32 s4, s4, s5
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s14, 0, s9
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_ashr_i32 s13, s5, 31
-; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
-; GFX9-NEXT:    s_add_i32 s5, s5, s13
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX9-NEXT:    s_xor_b32 s5, s5, s13
-; GFX9-NEXT:    s_xor_b32 s2, s8, s2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX9-NEXT:    s_ashr_i32 s3, s10, 31
-; GFX9-NEXT:    s_add_i32 s4, s10, s3
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    s_xor_b32 s4, s4, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s9
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s9, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    s_sub_i32 s5, 0, s4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v3
-; GFX9-NEXT:    s_add_i32 s9, s11, s8
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    s_xor_b32 s9, s9, s8
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
-; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s5
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v5
-; GFX9-NEXT:    s_xor_b32 s6, s6, s5
-; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; GFX9-NEXT:    s_xor_b32 s2, s13, s12
-; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
-; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
-; GFX9-NEXT:    s_xor_b32 s2, s5, s3
-; GFX9-NEXT:    s_sub_i32 s3, 0, s9
-; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v3
-; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v5
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s4, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v7
-; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
-; GFX9-NEXT:    s_add_i32 s5, s7, s3
-; GFX9-NEXT:    s_xor_b32 s5, s5, s3
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s9
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
-; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    s_xor_b32 s2, s3, s8
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
+; GFX9-NEXT:    s_xor_b32 s2, s7, s2
+; GFX9-NEXT:    s_sub_i32 s5, 0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX9-NEXT:    s_mul_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s7
+; GFX9-NEXT:    s_mul_i32 s7, s5, s8
+; GFX9-NEXT:    s_sub_i32 s2, s2, s7
+; GFX9-NEXT:    s_add_i32 s9, s5, 1
+; GFX9-NEXT:    s_sub_i32 s7, s2, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s2
+; GFX9-NEXT:    s_add_i32 s7, s5, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s5
+; GFX9-NEXT:    s_xor_b32 s2, s2, s6
+; GFX9-NEXT:    s_sub_i32 s2, s2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <4 x i32> %x, %y
@@ -2216,109 +2278,117 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX6-NEXT:    s_add_i32 s8, s8, s2
-; GFX6-NEXT:    s_xor_b32 s8, s8, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    s_ashr_i32 s13, s9, 31
-; GFX6-NEXT:    s_add_i32 s9, s9, s13
-; GFX6-NEXT:    s_xor_b32 s9, s9, s13
+; GFX6-NEXT:    s_add_i32 s3, s8, s2
+; GFX6-NEXT:    s_xor_b32 s2, s3, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s14, 0, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_ashr_i32 s12, s4, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    s_add_i32 s4, s4, s12
-; GFX6-NEXT:    s_xor_b32 s4, s4, s12
-; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s14, 0, s9
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    s_ashr_i32 s13, s5, 31
-; GFX6-NEXT:    s_add_i32 s5, s5, s13
-; GFX6-NEXT:    s_xor_b32 s5, s5, s13
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    s_ashr_i32 s3, s4, 31
+; GFX6-NEXT:    s_add_i32 s4, s4, s3
+; GFX6-NEXT:    s_xor_b32 s4, s4, s3
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX6-NEXT:    s_mul_i32 s8, s8, s2
+; GFX6-NEXT:    s_sub_i32 s4, s4, s8
+; GFX6-NEXT:    s_sub_i32 s8, s4, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX6-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX6-NEXT:    s_sub_i32 s8, s4, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX6-NEXT:    s_cselect_b32 s2, s8, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX6-NEXT:    s_add_i32 s8, s9, s4
+; GFX6-NEXT:    s_xor_b32 s4, s8, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX6-NEXT:    s_sub_i32 s8, 0, s4
+; GFX6-NEXT:    s_xor_b32 s2, s2, s3
+; GFX6-NEXT:    s_sub_i32 s9, s2, s3
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v0
+; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX6-NEXT:    s_add_i32 s5, s5, s8
+; GFX6-NEXT:    s_xor_b32 s5, s5, s8
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s4
+; GFX6-NEXT:    s_sub_i32 s2, s5, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
+; GFX6-NEXT:    s_add_i32 s4, s10, s3
+; GFX6-NEXT:    s_xor_b32 s3, s4, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_sub_i32 s4, 0, s3
+; GFX6-NEXT:    s_xor_b32 s2, s2, s8
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
+; GFX6-NEXT:    s_add_i32 s5, s6, s4
+; GFX6-NEXT:    s_xor_b32 s5, s5, s4
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_sub_i32 s6, s2, s8
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s3
+; GFX6-NEXT:    s_sub_i32 s2, s5, s2
+; GFX6-NEXT:    s_sub_i32 s5, s2, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX6-NEXT:    s_sub_i32 s5, s2, s3
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX6-NEXT:    s_cselect_b32 s5, s5, s2
+; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
+; GFX6-NEXT:    s_add_i32 s3, s11, s2
+; GFX6-NEXT:    s_xor_b32 s8, s3, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_sub_i32 s10, 0, s8
+; GFX6-NEXT:    s_xor_b32 s5, s5, s4
+; GFX6-NEXT:    s_sub_i32 s4, s5, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s9
+; GFX6-NEXT:    s_ashr_i32 s9, s7, 31
+; GFX6-NEXT:    s_add_i32 s7, s7, s9
+; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX6-NEXT:    s_xor_b32 s7, s7, s9
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    s_ashr_i32 s4, s10, 31
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    s_add_i32 s8, s10, s4
-; GFX6-NEXT:    s_xor_b32 s4, s8, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s4
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
-; GFX6-NEXT:    s_sub_i32 s5, 0, s4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
-; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
-; GFX6-NEXT:    s_add_i32 s9, s11, s8
-; GFX6-NEXT:    s_ashr_i32 s5, s6, 31
-; GFX6-NEXT:    s_xor_b32 s8, s9, s8
-; GFX6-NEXT:    s_add_i32 s6, s6, s5
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX6-NEXT:    s_xor_b32 s6, s6, s5
-; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s13, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s13, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s4
-; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX6-NEXT:    s_sub_i32 s6, 0, s8
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
-; GFX6-NEXT:    s_ashr_i32 s6, s7, 31
-; GFX6-NEXT:    s_add_i32 s7, s7, s6
-; GFX6-NEXT:    s_xor_b32 s7, s7, s6
-; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v2, s5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s5, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s6
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX6-NEXT:    s_mul_i32 s5, s5, s8
+; GFX6-NEXT:    s_sub_i32 s5, s7, s5
+; GFX6-NEXT:    s_sub_i32 s6, s5, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s8
+; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    s_sub_i32 s6, s5, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s8
+; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    s_xor_b32 s5, s5, s9
+; GFX6-NEXT:    s_sub_i32 s5, s5, s9
+; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -2993,70 +3063,70 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
-; GFX6-NEXT:    s_or_b32 s8, s8, 1
+; GFX6-NEXT:    s_or_b32 s10, s8, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GFX6-NEXT:    v_mov_b32_e32 v3, s8
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s8, s10, 0
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX6-NEXT:    s_sext_i32_i16 s6, s7
+; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
-; GFX6-NEXT:    v_mov_b32_e32 v4, s4
-; GFX6-NEXT:    s_sext_i32_i16 s4, s7
+; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX6-NEXT:    s_sext_i32_i16 s6, s5
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    s_xor_b32 s4, s6, s4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s8, v2
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    s_sext_i32_i16 s4, s5
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX6-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v5, s4
-; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
+; GFX6-NEXT:    v_mad_f32 v1, -v4, v0, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    s_ashr_i32 s6, s7, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
+; GFX6-NEXT:    s_ashr_i32 s4, s5, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v0
+; GFX6-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
+; GFX6-NEXT:    v_mad_f32 v4, -v5, v0, v4
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    v_mov_b32_e32 v6, s4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v0|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -3264,80 +3334,80 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
-; GFX6-NEXT:    s_or_b32 s8, s8, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s8
+; GFX6-NEXT:    s_or_b32 s10, s8, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    s_ashr_i32 s9, s6, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s8, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s8, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
-; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
-; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    s_ashr_i32 s8, s6, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s8
+; GFX6-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GFX6-NEXT:    s_xor_b32 s4, s4, s9
+; GFX6-NEXT:    s_xor_b32 s4, s4, s8
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, s4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v2|, |v1|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
 ; GFX6-NEXT:    s_sext_i32_i16 s4, s7
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s5
 ; GFX6-NEXT:    s_xor_b32 s4, s6, s4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_mov_b32_e32 v5, s4
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v2|
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    s_ashr_i32 s4, s7, 16
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    s_ashr_i32 s6, s7, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    s_lshr_b32 s6, s7, 16
+; GFX6-NEXT:    s_lshr_b32 s8, s7, 16
 ; GFX6-NEXT:    s_ashr_i32 s7, s5, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GFX6-NEXT:    s_xor_b32 s4, s7, s4
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    s_xor_b32 s6, s7, s6
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
+; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
 ; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    v_mov_b32_e32 v6, s4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s6
-; GFX6-NEXT:    s_lshr_b32 s4, s5, 16
+; GFX6-NEXT:    s_or_b32 s9, s6, 1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v4|, |v2|
+; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT:    s_cselect_b32 s6, s9, 0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s6, v5
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s8
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3618,15 +3688,15 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s4
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3696,22 +3766,22 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_i32 s2, s4, 0x30008
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX6-NEXT:    s_bfe_i32 s5, s4, 0x30000
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
-; GFX6-NEXT:    s_xor_b32 s2, s5, s2
+; GFX6-NEXT:    s_bfe_i32 s3, s4, 0x30000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX6-NEXT:    s_xor_b32 s2, s3, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 30
-; GFX6-NEXT:    s_or_b32 s2, s2, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s2
+; GFX6-NEXT:    s_lshr_b32 s5, s4, 8
+; GFX6-NEXT:    s_or_b32 s6, s2, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX6-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -4184,53 +4254,53 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
-; GFX6-NEXT:    s_or_b32 s8, s8, 1
+; GFX6-NEXT:    s_or_b32 s10, s8, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s8, s10, 0
+; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GFX6-NEXT:    v_mov_b32_e32 v3, s8
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s8, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    s_sext_i32_i16 s6, s7
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
-; GFX6-NEXT:    v_mov_b32_e32 v4, s4
-; GFX6-NEXT:    s_sext_i32_i16 s4, s7
+; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX6-NEXT:    s_sext_i32_i16 s5, s5
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v2|, |v0|
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
+; GFX6-NEXT:    s_sext_i32_i16 s4, s5
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX6-NEXT:    s_xor_b32 s4, s4, s6
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
+; GFX6-NEXT:    v_mad_f32 v3, -v4, v0, v3
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    v_mov_b32_e32 v5, s4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v3i16:
@@ -4394,37 +4464,37 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
-; GFX6-NEXT:    s_or_b32 s8, s8, 1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s8
+; GFX6-NEXT:    s_or_b32 s10, s8, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    s_ashr_i32 s9, s6, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s8, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s8, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
-; GFX6-NEXT:    s_lshr_b32 s8, s4, 16
-; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
+; GFX6-NEXT:    s_ashr_i32 s8, s6, 16
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s8
+; GFX6-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GFX6-NEXT:    s_xor_b32 s4, s4, s9
+; GFX6-NEXT:    s_xor_b32 s4, s4, s8
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX6-NEXT:    v_mov_b32_e32 v4, s4
+; GFX6-NEXT:    s_or_b32 s4, s4, 1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v2|, |v1|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v3
 ; GFX6-NEXT:    s_sext_i32_i16 s4, s7
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s6
 ; GFX6-NEXT:    s_sext_i32_i16 s6, s5
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s6
@@ -4436,12 +4506,12 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    v_mov_b32_e32 v5, s4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v3|, |v2|
+; GFX6-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s7
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s8, v1
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s10, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -4826,7 +4896,7 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    s_lshr_b32 s4, s6, 15
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
@@ -4997,54 +5067,54 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
 ; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
-; GFX6-NEXT:    v_mov_b32_e32 v5, s4
-; GFX6-NEXT:    s_bfe_i32 s4, s8, 0xf000f
+; GFX6-NEXT:    v_alignbit_b32 v0, s7, v0, 30
+; GFX6-NEXT:    s_or_b32 s7, s4, 1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GFX6-NEXT:    s_bfe_i32 s5, s6, 0xf000f
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    s_cselect_b32 s4, s7, 0
+; GFX6-NEXT:    s_bfe_i32 s5, s8, 0xf000f
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s4, v4
+; GFX6-NEXT:    s_bfe_i32 s4, s6, 0xf000f
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX6-NEXT:    v_alignbit_b32 v1, s9, v1, 30
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    s_xor_b32 s4, s4, s5
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
-; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
+; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    s_or_b32 s4, s4, 1
-; GFX6-NEXT:    v_alignbit_b32 v0, s7, v0, 30
-; GFX6-NEXT:    v_mov_b32_e32 v6, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s4, v5
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
+; GFX6-NEXT:    v_mad_f32 v5, -v1, v2, v5
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v2|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -5216,52 +5286,53 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_bfe_i32 s4, s8, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s4
 ; GFX6-NEXT:    s_mov_b32 s1, s5
-; GFX6-NEXT:    s_bfe_i32 s5, s8, 0xf0000
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
-; GFX6-NEXT:    v_mov_b32_e32 v2, s8
-; GFX6-NEXT:    v_alignbit_b32 v2, s9, v2, 30
-; GFX6-NEXT:    s_bfe_i32 s9, s6, 0xf0000
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s9
+; GFX6-NEXT:    s_bfe_i32 s5, s6, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX6-NEXT:    s_xor_b32 s5, s9, s5
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 30
-; GFX6-NEXT:    s_or_b32 s5, s5, 1
+; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v6, v5, v6
 ; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    v_mad_f32 v5, -v6, v4, v5
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX6-NEXT:    v_mov_b32_e32 v7, s5
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_alignbit_b32 v0, s7, v0, 30
+; GFX6-NEXT:    s_lshr_b32 s7, s6, 15
+; GFX6-NEXT:    v_alignbit_b32 v2, s9, v2, 30
+; GFX6-NEXT:    s_lshr_b32 s9, s8, 15
+; GFX6-NEXT:    s_or_b32 s10, s4, 1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v5|, |v4|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s4, v6
+; GFX6-NEXT:    s_bfe_i32 s4, s8, 0xf000f
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s4
+; GFX6-NEXT:    s_bfe_i32 s5, s6, 0xf000f
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, s5
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s8
-; GFX6-NEXT:    s_bfe_i32 s5, s8, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s5
-; GFX6-NEXT:    s_mov_b32 s0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    s_lshr_b32 s4, s6, 15
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s6, v4
-; GFX6-NEXT:    s_bfe_i32 s6, s6, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
-; GFX6-NEXT:    s_xor_b32 s5, s6, s5
-; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    v_mul_f32_e32 v7, v6, v7
 ; GFX6-NEXT:    v_trunc_f32_e32 v7, v7
 ; GFX6-NEXT:    v_mad_f32 v6, -v7, v5, v6
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 30
+; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s6, v4
+; GFX6-NEXT:    s_or_b32 s6, s4, 1
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v2
-; GFX6-NEXT:    s_or_b32 s5, s5, 1
-; GFX6-NEXT:    v_alignbit_b32 v0, s7, v0, 30
-; GFX6-NEXT:    v_mov_b32_e32 v8, s5
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
+; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, s4, v7
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
@@ -5272,12 +5343,11 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GFX6-NEXT:    v_mad_f32 v7, -v2, v6, v7
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
-; GFX6-NEXT:    s_lshr_b32 s7, s8, 15
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s7
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s9
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v5
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
@@ -5655,95 +5725,104 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s7
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT:    s_sub_i32 s0, 0, s2
+; GFX6-NEXT:    s_sub_i32 s3, 0, s2
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
-; GFX6-NEXT:    s_sub_i32 s0, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX6-NEXT:    s_mul_i32 s3, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s4, s3
+; GFX6-NEXT:    s_sub_i32 s4, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX6-NEXT:    s_sub_i32 s4, 0, s6
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX6-NEXT:    s_mul_i32 s0, s0, s6
+; GFX6-NEXT:    s_sub_i32 s0, s5, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s6
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s6
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s6
-; GFX9-NEXT:    s_lshl_b32 s7, 0x1000, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT:    s_sub_i32 s2, 0, s6
-; GFX9-NEXT:    s_sub_i32 s3, 0, s7
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s7
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s6, v3
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s7, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s7
+; GFX9-NEXT:    s_mul_i32 s7, s6, s3
+; GFX9-NEXT:    s_sub_i32 s4, s4, s7
+; GFX9-NEXT:    s_add_i32 s9, s6, 1
+; GFX9-NEXT:    s_sub_i32 s7, s4, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX9-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX9-NEXT:    s_add_i32 s7, s6, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s6
+; GFX9-NEXT:    s_sub_i32 s4, 0, s2
+; GFX9-NEXT:    s_mul_i32 s4, s4, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT:    s_add_i32 s8, s8, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s8
+; GFX9-NEXT:    s_mul_i32 s6, s4, s2
+; GFX9-NEXT:    s_sub_i32 s5, s5, s6
+; GFX9-NEXT:    s_add_i32 s7, s4, 1
+; GFX9-NEXT:    s_sub_i32 s6, s5, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s4, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s6, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
@@ -5981,45 +6060,49 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_sub_i32 s2, 0, s6
+; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_sub_i32 s3, 0, s2
+; GFX6-NEXT:    s_lshl_b32 s6, 0x1000, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s7
-; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX6-NEXT:    s_mul_i32 s3, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s4, s3
+; GFX6-NEXT:    s_sub_i32 s4, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX6-NEXT:    s_sub_i32 s4, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b32 s4, s4, s3
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX6-NEXT:    s_mul_i32 s7, s7, s6
+; GFX6-NEXT:    s_sub_i32 s5, s5, s7
+; GFX6-NEXT:    s_sub_i32 s7, s5, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s6
+; GFX6-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX6-NEXT:    s_sub_i32 s7, s5, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s6
+; GFX6-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -6181,7 +6264,6 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    s_xor_b32 s2, s2, s9
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    s_xor_b32 s8, s9, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
@@ -6189,56 +6271,62 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s0, s0, s3
+; GFX6-NEXT:    s_sub_i32 s0, s2, s0
+; GFX6-NEXT:    s_sub_i32 s1, s0, s3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_xor_b32 s0, s9, s8
+; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s4
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s5, 0, s3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
 ; GFX9-NEXT:    s_add_i32 s2, s2, s5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_xor_b32 s2, s2, s5
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    s_xor_b32 s2, s5, s4
-; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s7
+; GFX9-NEXT:    s_mul_i32 s8, s6, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s8
+; GFX9-NEXT:    s_add_i32 s7, s6, 1
+; GFX9-NEXT:    s_sub_i32 s8, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX9-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX9-NEXT:    s_add_i32 s7, s6, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s6
+; GFX9-NEXT:    s_xor_b32 s3, s5, s4
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -6446,148 +6534,158 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ;
 ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s10
-; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX6-NEXT:    s_add_i32 s0, s0, s1
-; GFX6-NEXT:    s_xor_b32 s2, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
+; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, s3
+; GFX6-NEXT:    s_xor_b32 s2, s2, s3
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_lshl_b32 s0, 0x1000, s11
-; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
-; GFX6-NEXT:    s_add_i32 s0, s0, s3
+; GFX6-NEXT:    s_sub_i32 s6, 0, s2
+; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s11, 0, s2
-; GFX6-NEXT:    s_xor_b32 s10, s0, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_ashr_i32 s0, s8, 31
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    s_add_i32 s8, s8, s0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s11, v0
-; GFX6-NEXT:    s_xor_b32 s8, s8, s0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    s_xor_b32 s11, s0, s1
-; GFX6-NEXT:    s_sub_i32 s0, 0, s10
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX6-NEXT:    s_add_i32 s1, s9, s0
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    s_xor_b32 s2, s0, s3
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
-; GFX6-NEXT:    v_xor_b32_e32 v0, s11, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s11, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s6, v0
+; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
+; GFX6-NEXT:    s_add_i32 s4, s4, s6
+; GFX6-NEXT:    s_xor_b32 s4, s4, s6
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_xor_b32 s6, s6, s3
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX6-NEXT:    s_mul_i32 s3, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s4, s3
+; GFX6-NEXT:    s_sub_i32 s4, s3, s2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_ashr_i32 s4, s7, 31
+; GFX6-NEXT:    s_add_i32 s7, s7, s4
+; GFX6-NEXT:    s_xor_b32 s7, s7, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s7
+; GFX6-NEXT:    s_sub_i32 s8, 0, s7
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v0, s6, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v2
+; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX6-NEXT:    s_add_i32 s5, s5, s8
+; GFX6-NEXT:    s_xor_b32 s5, s5, s8
+; GFX6-NEXT:    v_mul_hi_u32 v1, v2, v3
+; GFX6-NEXT:    s_xor_b32 s4, s8, s4
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX6-NEXT:    s_mul_i32 s6, s6, s7
+; GFX6-NEXT:    s_sub_i32 s5, s5, s6
+; GFX6-NEXT:    s_sub_i32 s6, s5, s7
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s7
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s7
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s6
-; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
-; GFX9-NEXT:    s_add_i32 s0, s0, s1
-; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s6
+; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    s_lshl_b32 s6, 0x1000, s7
-; GFX9-NEXT:    s_ashr_i32 s8, s6, 31
-; GFX9-NEXT:    s_add_i32 s6, s6, s8
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s6, s6, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT:    s_sub_i32 s10, 0, s0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
 ; GFX9-NEXT:    s_add_i32 s4, s4, s7
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s10, 0, s6
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s3, s7, s3
 ; GFX9-NEXT:    s_xor_b32 s4, s4, s7
-; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v1
-; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
-; GFX9-NEXT:    s_add_i32 s5, s5, s9
-; GFX9-NEXT:    s_xor_b32 s5, s5, s9
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v4, s4, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s5, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    s_xor_b32 s1, s7, s1
-; GFX9-NEXT:    s_xor_b32 s0, s9, s8
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s1, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s1, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
-; GFX9-NEXT:    s_endpgm
-  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
-  %r = sdiv <2 x i32> %x, %shl.y
-  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
-  ret void
-}
-
-define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
-; CHECK-LABEL: @srem_i32_oddk_denom(
-; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; GFX9-NEXT:    s_sub_i32 s7, 0, s2
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s7, s7, s8
+; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
+; GFX9-NEXT:    s_add_i32 s8, s8, s7
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s8
+; GFX9-NEXT:    s_mul_i32 s8, s7, s2
+; GFX9-NEXT:    s_sub_i32 s4, s4, s8
+; GFX9-NEXT:    s_add_i32 s9, s7, 1
+; GFX9-NEXT:    s_sub_i32 s8, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX9-NEXT:    s_add_i32 s8, s7, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s8, s7
+; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
+; GFX9-NEXT:    s_add_i32 s6, s6, s4
+; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s4, s7, s4
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s3, s5, s7
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s5, 0, s6
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s7
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_sub_i32 s3, s3, s7
+; GFX9-NEXT:    s_add_i32 s8, s5, 1
+; GFX9-NEXT:    s_sub_i32 s7, s3, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX9-NEXT:    s_add_i32 s7, s5, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX9-NEXT:    s_cselect_b32 s3, s7, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
+  %r = sdiv <2 x i32> %x, %shl.y
+  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
+; CHECK-LABEL: @srem_i32_oddk_denom(
+; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
+; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i32_oddk_denom:
@@ -6699,18 +6797,20 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX6-NEXT:    s_mul_i32 s7, s7, s4
+; GFX6-NEXT:    s_sub_i32 s6, s6, s7
+; GFX6-NEXT:    s_sub_i32 s7, s6, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX6-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX6-NEXT:    s_sub_i32 s7, s6, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX6-NEXT:    s_cselect_b32 s4, s7, s6
+; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -6903,57 +7003,61 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GFX6-NEXT:    s_lshl_b32 s2, 0x1000, s6
 ; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX6-NEXT:    s_add_i32 s2, s2, s3
-; GFX6-NEXT:    s_xor_b32 s6, s2, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_lshl_b32 s7, 0x1000, s7
-; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
-; GFX6-NEXT:    s_add_i32 s7, s7, s8
+; GFX6-NEXT:    s_xor_b32 s2, s2, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_sub_i32 s3, 0, s2
+; GFX6-NEXT:    s_ashr_i32 s6, s4, 31
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s7, s7, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_sub_i32 s9, 0, s6
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX6-NEXT:    s_add_i32 s4, s4, s8
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX6-NEXT:    s_xor_b32 s4, s4, s8
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    s_sub_i32 s9, 0, s7
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    s_add_i32 s3, s4, s6
+; GFX6-NEXT:    s_xor_b32 s3, s3, s6
+; GFX6-NEXT:    s_lshl_b32 s4, 0x1000, s7
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX6-NEXT:    s_mul_i32 s7, s7, s2
+; GFX6-NEXT:    s_sub_i32 s3, s3, s7
+; GFX6-NEXT:    s_sub_i32 s7, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX6-NEXT:    s_sub_i32 s7, s3, s2
+; GFX6-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX6-NEXT:    s_cselect_b32 s7, s7, s3
+; GFX6-NEXT:    s_ashr_i32 s2, s4, 31
+; GFX6-NEXT:    s_add_i32 s4, s4, s2
+; GFX6-NEXT:    s_xor_b32 s4, s4, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX6-NEXT:    s_sub_i32 s2, 0, s4
+; GFX6-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX6-NEXT:    s_xor_b32 s7, s7, s6
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    s_sub_i32 s6, s7, s6
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    s_add_i32 s2, s5, s8
+; GFX6-NEXT:    s_xor_b32 s5, s2, s8
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
-; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX6-NEXT:    s_add_i32 s5, s5, s9
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    s_xor_b32 s4, s5, s9
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX6-NEXT:    s_mul_i32 s7, s7, s4
+; GFX6-NEXT:    s_sub_i32 s5, s5, s7
+; GFX6-NEXT:    s_sub_i32 s7, s5, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s4
+; GFX6-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX6-NEXT:    s_sub_i32 s7, s5, s4
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s4
+; GFX6-NEXT:    s_cselect_b32 s4, s7, s5
+; GFX6-NEXT:    s_xor_b32 s4, s4, s8
+; GFX6-NEXT:    s_sub_i32 s4, s4, s8
+; GFX6-NEXT:    v_mov_b32_e32 v0, s6
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -7079,9 +7183,9 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -7133,12 +7237,13 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
 ; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v2
@@ -7148,9 +7253,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -7256,33 +7360,33 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_add_u32 s6, s3, 2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX9-NEXT:    s_addc_u32 s0, s2, 0
-; GFX9-NEXT:    s_add_u32 s9, s3, 1
-; GFX9-NEXT:    s_addc_u32 s1, s2, 0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s7, s7, s8
-; GFX9-NEXT:    s_cmpk_gt_u32 s7, 0x11e
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NEXT:    s_add_u32 s0, s3, 1
+; GFX9-NEXT:    s_addc_u32 s6, s2, 0
+; GFX9-NEXT:    s_add_u32 s1, s3, 2
+; GFX9-NEXT:    s_addc_u32 s9, s2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s7, s8
+; GFX9-NEXT:    s_cmpk_gt_u32 s0, 0x11e
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
-; GFX9-NEXT:    s_cmpk_eq_i32 s7, 0x11f
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v4, s[0:1]
+; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x11f
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
@@ -7431,7 +7535,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    s_movk_i32 s6, 0xf001
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_movk_i32 s8, 0xfff
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -7439,8 +7543,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], 12
-; GFX6-NEXT:    s_movk_i32 s0, 0xfff
+; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
@@ -7498,143 +7602,159 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s8
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s8
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
-; GFX6-NEXT:    s_movk_i32 s0, 0xffe
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
+; GFX6-NEXT:    s_movk_i32 s2, 0xffe
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_mov_b32_e32 v1, s9
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, -1, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_movk_i32 s2, 0xf001
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s2
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
-; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0xffe
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9-NEXT:    s_mul_hi_u32 s5, s4, 0xfffff001
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX9-NEXT:    s_sub_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_i32 s9, s8, 0xfffff001
+; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    s_mul_i32 s11, s4, 0xfffff001
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s5
+; GFX9-NEXT:    s_mul_i32 s10, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s11
+; GFX9-NEXT:    s_add_u32 s4, s4, s10
+; GFX9-NEXT:    s_addc_u32 s9, 0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s12, s8, s11
+; GFX9-NEXT:    s_mul_i32 s11, s8, s11
+; GFX9-NEXT:    s_add_u32 s4, s4, s11
+; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s5
+; GFX9-NEXT:    s_addc_u32 s4, s9, s12
+; GFX9-NEXT:    s_addc_u32 s9, s10, 0
+; GFX9-NEXT:    s_mul_i32 s5, s8, s5
+; GFX9-NEXT:    s_add_u32 s4, s4, s5
+; GFX9-NEXT:    s_addc_u32 s5, 0, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s4, s8, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_hi_u32 s9, s8, 0xfffff001
+; GFX9-NEXT:    s_mul_i32 s5, s4, 0xfffff001
+; GFX9-NEXT:    s_sub_i32 s9, s9, s8
+; GFX9-NEXT:    s_add_i32 s9, s9, s5
+; GFX9-NEXT:    s_mul_i32 s11, s8, 0xfffff001
+; GFX9-NEXT:    s_mul_hi_u32 s5, s8, s9
+; GFX9-NEXT:    s_mul_i32 s10, s8, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s8, s11
+; GFX9-NEXT:    s_add_u32 s8, s8, s10
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s12, s4, s11
+; GFX9-NEXT:    s_mul_i32 s11, s4, s11
+; GFX9-NEXT:    s_add_u32 s8, s8, s11
+; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s9
+; GFX9-NEXT:    s_addc_u32 s5, s5, s12
+; GFX9-NEXT:    s_addc_u32 s8, s10, 0
+; GFX9-NEXT:    s_mul_i32 s9, s4, s9
+; GFX9-NEXT:    s_add_u32 s5, s5, s9
+; GFX9-NEXT:    s_addc_u32 s8, 0, s8
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s4, s4, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s8, s6, s4
+; GFX9-NEXT:    s_mul_hi_u32 s10, s6, s9
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s4
+; GFX9-NEXT:    s_add_u32 s8, s10, s8
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s11, s7, s9
+; GFX9-NEXT:    s_mul_i32 s9, s7, s9
+; GFX9-NEXT:    s_add_u32 s8, s8, s9
+; GFX9-NEXT:    s_mul_hi_u32 s10, s7, s4
+; GFX9-NEXT:    s_addc_u32 s5, s5, s11
+; GFX9-NEXT:    s_addc_u32 s8, s10, 0
+; GFX9-NEXT:    s_mul_i32 s4, s7, s4
+; GFX9-NEXT:    s_add_u32 s4, s5, s4
+; GFX9-NEXT:    s_addc_u32 s5, 0, s8
+; GFX9-NEXT:    s_add_u32 s8, s4, 1
+; GFX9-NEXT:    s_addc_u32 s9, s5, 0
+; GFX9-NEXT:    s_add_u32 s10, s4, 2
+; GFX9-NEXT:    s_mul_i32 s13, s5, 0xfff
+; GFX9-NEXT:    s_mul_hi_u32 s14, s4, 0xfff
+; GFX9-NEXT:    s_addc_u32 s11, s5, 0
+; GFX9-NEXT:    s_add_i32 s14, s14, s13
+; GFX9-NEXT:    s_mul_i32 s13, s4, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    s_movk_i32 s12, 0xfff
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s7, s14
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s7, s6, 0
+; GFX9-NEXT:    s_movk_i32 s12, 0xffe
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s12, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s12, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
@@ -7806,19 +7926,19 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v5, s7
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v4, s7
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -7914,40 +8034,40 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    s_mov_b32 s8, 0x9761f7c9
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
 ; GFX9-NEXT:    s_subb_u32 s6, s1, 0x11f
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v0
+; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s8, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:    s_subb_u32 s10, s6, 0
 ; GFX9-NEXT:    s_cmpk_gt_u32 s10, 0x11e
 ; GFX9-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s12, v3
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s12, v1
 ; GFX9-NEXT:    s_cmpk_eq_i32 s10, 0x11f
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
 ; GFX9-NEXT:    s_subb_u32 s2, s6, 0x11f
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v3
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v1
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_subb_u32 s0, s2, 0
+; GFX9-NEXT:    s_subb_u32 s2, s2, 0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, s10
+; GFX9-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s2, s7, s9
-; GFX9-NEXT:    s_cmpk_gt_u32 s2, 0x11e
-; GFX9-NEXT:    v_mov_b32_e32 v5, s10
-; GFX9-NEXT:    v_mov_b32_e32 v6, s0
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s7, s9
+; GFX9-NEXT:    s_cmpk_gt_u32 s0, 0x11e
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s12, v0
-; GFX9-NEXT:    s_cmpk_eq_i32 s2, 0x11f
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x11f
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
@@ -8157,9 +8277,9 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s5, 0xffed2705
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s8, 0xffed2705
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -8167,15 +8287,15 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX6-NEXT:    s_add_u32 s2, s2, s8
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s5
-; GFX6-NEXT:    s_mov_b32 s9, s8
-; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s4, 0x12d8fb
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s8
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s8
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s8
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_mov_b32 s1, s5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -8185,8 +8305,6 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
-; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
@@ -8194,12 +8312,10 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s8
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s8
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
@@ -8214,176 +8330,196 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    s_add_u32 s6, s6, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_addc_u32 s7, s7, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s4
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s4
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
+; GFX6-NEXT:    v_mov_b32_e32 v5, s7
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s6, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
-; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
+; GFX6-NEXT:    s_mov_b32 s4, 0x12d8fa
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, -1, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4996c7d8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_hi_u32 s6, s5, 0xffed2705
+; GFX9-NEXT:    s_mul_i32 s7, s4, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_sub_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_i32 s9, s5, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s6
+; GFX9-NEXT:    s_mul_i32 s8, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s5, s9
+; GFX9-NEXT:    s_add_u32 s5, s5, s8
+; GFX9-NEXT:    s_addc_u32 s7, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s9
+; GFX9-NEXT:    s_mul_i32 s9, s4, s9
+; GFX9-NEXT:    s_add_u32 s5, s5, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT:    s_addc_u32 s5, s7, s10
+; GFX9-NEXT:    s_addc_u32 s7, s8, 0
+; GFX9-NEXT:    s_mul_i32 s6, s4, s6
+; GFX9-NEXT:    s_add_u32 s5, s5, s6
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s4, s4, s6
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s4, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_sub_i32 s5, s7, s6
+; GFX9-NEXT:    s_mul_i32 s8, s6, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s5
+; GFX9-NEXT:    s_mul_i32 s12, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
+; GFX9-NEXT:    s_add_u32 s6, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s8
+; GFX9-NEXT:    s_mul_i32 s10, s4, s8
+; GFX9-NEXT:    s_addc_u32 s8, 0, s11
+; GFX9-NEXT:    s_add_u32 s6, s6, s10
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s5
+; GFX9-NEXT:    s_addc_u32 s6, s8, s9
+; GFX9-NEXT:    s_addc_u32 s7, s7, 0
+; GFX9-NEXT:    s_mul_i32 s5, s4, s5
+; GFX9-NEXT:    s_add_u32 s5, s6, s5
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s6, s4, s6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX9-NEXT:    s_add_u32 s0, s6, s2
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s1, s7, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s3
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v6, s1
-; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s3, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
-; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT:    s_endpgm
-  %r = sdiv i64 %x, 1235195
-  store i64 %r, i64 addrspace(1)* %out
-  ret void
-}
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s7, s2, s6
+; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_add_u32 s7, s9, s7
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX9-NEXT:    s_mul_i32 s8, s3, s8
+; GFX9-NEXT:    s_add_u32 s7, s7, s8
+; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s6
+; GFX9-NEXT:    s_addc_u32 s5, s5, s10
+; GFX9-NEXT:    s_addc_u32 s7, s9, 0
+; GFX9-NEXT:    s_mul_i32 s6, s3, s6
+; GFX9-NEXT:    s_add_u32 s5, s5, s6
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    s_add_u32 s7, s5, 1
+; GFX9-NEXT:    s_addc_u32 s8, s6, 0
+; GFX9-NEXT:    s_add_u32 s9, s5, 2
+; GFX9-NEXT:    s_mul_i32 s12, s6, 0x12d8fb
+; GFX9-NEXT:    s_mul_hi_u32 s13, s5, 0x12d8fb
+; GFX9-NEXT:    s_addc_u32 s10, s6, 0
+; GFX9-NEXT:    s_add_i32 s13, s13, s12
+; GFX9-NEXT:    s_mul_i32 s12, s5, 0x12d8fb
+; GFX9-NEXT:    v_mov_b32_e32 v0, s12
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    s_mov_b32 s11, 0x12d8fb
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s2, s3, s13
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s11, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s3, s2, 0
+; GFX9-NEXT:    s_mov_b32 s11, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s11, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v4, s9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    v_mov_b32_e32 v4, s10
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s11, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s6
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v4, vcc
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT:    s_endpgm
+  %r = sdiv i64 %x, 1235195
+  store i64 %r, i64 addrspace(1)* %out
+  ret void
+}
 
 define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
 ; CHECK-LABEL: @sdiv_i64_pow2k_denom(
@@ -8493,9 +8629,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -8545,12 +8681,13 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
@@ -8560,10 +8697,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s1
@@ -8688,31 +8824,31 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_add_u32 s6, s12, 2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-; GFX9-NEXT:    s_addc_u32 s0, s13, 0
-; GFX9-NEXT:    s_add_u32 s15, s12, 1
-; GFX9-NEXT:    s_addc_u32 s1, s13, 0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s7, s7, s14
-; GFX9-NEXT:    s_cmp_ge_u32 s7, s9
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NEXT:    s_add_u32 s0, s12, 1
+; GFX9-NEXT:    s_addc_u32 s6, s13, 0
+; GFX9-NEXT:    s_add_u32 s1, s12, 2
+; GFX9-NEXT:    s_addc_u32 s15, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
-; GFX9-NEXT:    s_cmp_eq_u32 s7, s9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_mov_b32_e32 v4, s15
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s7, s14
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s14
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s13
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, s15
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
@@ -8825,10 +8961,10 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
 ; GFX6-NEXT:    s_add_u32 s0, s0, s8
 ; GFX6-NEXT:    s_addc_u32 s1, s1, 0
-; GFX6-NEXT:    s_ashr_i64 s[8:9], s[0:1], 12
+; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -8838,8 +8974,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
-; GFX6-NEXT:    s_add_u32 s0, s2, s10
+; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX6-NEXT:    s_add_u32 s2, s2, s8
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
@@ -8849,9 +8985,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s6
-; GFX6-NEXT:    s_mov_b32 s11, s10
-; GFX6-NEXT:    s_addc_u32 s1, s3, s10
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
+; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
@@ -8871,180 +9007,195 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    s_movk_i32 s2, 0xfff
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    s_movk_i32 s9, 0xfff
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s2
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s9
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s9
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v5, s1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v5, s3
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s2, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
-; GFX6-NEXT:    s_movk_i32 s0, 0xffe
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
+; GFX6-NEXT:    s_movk_i32 s2, 0xffe
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
-; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s10
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, -1, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s8
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s8
-; GFX6-NEXT:    v_mov_b32_e32 v1, s9
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_movk_i32 s8, 0xf001
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x457ff000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f800000
+; GFX9-NEXT:    v_mac_f32_e32 v1, 0, v2
+; GFX9-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
 ; GFX9-NEXT:    s_add_u32 s0, s4, s0
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v2
 ; GFX9-NEXT:    s_addc_u32 s1, s5, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v5, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    s_ashr_i64 s[4:5], s[0:1], 12
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
-; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
-; GFX9-NEXT:    s_add_u32 s0, s6, s8
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_addc_u32 s1, s7, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, s0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX9-NEXT:    s_movk_i32 s6, 0xfff
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s6
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s6
-; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mov_b32_e32 v6, s1
-; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s6, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0xffe
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s8, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX9-NEXT:    s_mul_hi_u32 s5, s4, 0xfffff001
+; GFX9-NEXT:    s_mul_i32 s9, s8, 0xfffff001
+; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    s_sub_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_i32 s11, s4, 0xfffff001
+; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s5
+; GFX9-NEXT:    s_mul_i32 s10, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s4, s11
+; GFX9-NEXT:    s_add_u32 s4, s4, s10
+; GFX9-NEXT:    s_addc_u32 s9, 0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s12, s8, s11
+; GFX9-NEXT:    s_mul_i32 s11, s8, s11
+; GFX9-NEXT:    s_add_u32 s4, s4, s11
+; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s5
+; GFX9-NEXT:    s_addc_u32 s4, s9, s12
+; GFX9-NEXT:    s_addc_u32 s9, s10, 0
+; GFX9-NEXT:    s_mul_i32 s5, s8, s5
+; GFX9-NEXT:    s_add_u32 s4, s4, s5
+; GFX9-NEXT:    s_addc_u32 s5, 0, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s4, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s4, s8, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX9-NEXT:    s_mul_i32 s5, s4, 0xfffff001
+; GFX9-NEXT:    s_mul_hi_u32 s9, s8, 0xfffff001
+; GFX9-NEXT:    s_add_i32 s9, s9, s5
+; GFX9-NEXT:    s_sub_i32 s5, s9, s8
+; GFX9-NEXT:    s_mul_i32 s10, s8, 0xfffff001
+; GFX9-NEXT:    s_mul_hi_u32 s13, s8, s5
+; GFX9-NEXT:    s_mul_i32 s14, s8, s5
+; GFX9-NEXT:    s_mul_hi_u32 s8, s8, s10
+; GFX9-NEXT:    s_add_u32 s8, s8, s14
+; GFX9-NEXT:    s_mul_hi_u32 s11, s4, s10
+; GFX9-NEXT:    s_mul_i32 s12, s4, s10
+; GFX9-NEXT:    s_addc_u32 s10, 0, s13
+; GFX9-NEXT:    s_add_u32 s8, s8, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s5
+; GFX9-NEXT:    s_addc_u32 s8, s10, s11
+; GFX9-NEXT:    s_addc_u32 s9, s9, 0
+; GFX9-NEXT:    s_mul_i32 s5, s4, s5
+; GFX9-NEXT:    s_add_u32 s5, s8, s5
+; GFX9-NEXT:    s_addc_u32 s8, 0, s9
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s5, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s8, s4, s8
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
+; GFX9-NEXT:    s_add_u32 s6, s6, s4
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s7, s7, s4
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX9-NEXT:    s_mul_i32 s9, s6, s8
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s10
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s8
+; GFX9-NEXT:    s_add_u32 s9, s11, s9
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s12, s7, s10
+; GFX9-NEXT:    s_mul_i32 s10, s7, s10
+; GFX9-NEXT:    s_add_u32 s9, s9, s10
+; GFX9-NEXT:    s_mul_hi_u32 s11, s7, s8
+; GFX9-NEXT:    s_addc_u32 s5, s5, s12
+; GFX9-NEXT:    s_addc_u32 s9, s11, 0
+; GFX9-NEXT:    s_mul_i32 s8, s7, s8
+; GFX9-NEXT:    s_add_u32 s5, s5, s8
+; GFX9-NEXT:    s_addc_u32 s8, 0, s9
+; GFX9-NEXT:    s_add_u32 s9, s5, 1
+; GFX9-NEXT:    s_addc_u32 s10, s8, 0
+; GFX9-NEXT:    s_add_u32 s11, s5, 2
+; GFX9-NEXT:    s_mul_i32 s14, s8, 0xfff
+; GFX9-NEXT:    s_mul_hi_u32 s15, s5, 0xfff
+; GFX9-NEXT:    s_addc_u32 s12, s8, 0
+; GFX9-NEXT:    s_add_i32 s15, s15, s14
+; GFX9-NEXT:    s_mul_i32 s14, s5, 0xfff
+; GFX9-NEXT:    v_mov_b32_e32 v1, s14
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
+; GFX9-NEXT:    s_movk_i32 s13, 0xfff
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s7, s15
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s13, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s7, s6, 0
+; GFX9-NEXT:    s_movk_i32 s13, 0xffe
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s13, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_mov_b32_e32 v4, s12
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s13, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s4, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
@@ -9099,7 +9250,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v0
-; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -9160,9 +9310,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s13
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -9174,72 +9324,71 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[16:17], s[14:15]
+; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    s_add_u32 s2, s2, s4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s4
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s3
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX6-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GFX6-NEXT:    v_rcp_f32_e32 v6, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
-; GFX6-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX6-NEXT:    v_rcp_f32_e32 v3, v8
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v6
+; GFX6-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    s_sub_u32 s0, 0, s2
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX6-NEXT:    s_subb_u32 s1, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
-; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
+; GFX6-NEXT:    s_sub_u32 s12, 0, s2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v3
+; GFX6-NEXT:    s_subb_u32 s13, 0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v6, s13, v2
+; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
-; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GFX6-NEXT:    s_mov_b32 s13, s12
-; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
+; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s15, v1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s12, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, s13, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
@@ -9254,12 +9403,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    s_add_u32 s0, s6, s12
+; GFX6-NEXT:    s_add_u32 s6, s6, s12
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    s_addc_u32 s1, s7, s12
+; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    s_addc_u32 s7, s7, s12
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[12:13]
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
@@ -9269,7 +9420,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v7, s7, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v2
-; GFX6-NEXT:    v_mov_b32_e32 v6, s15
+; GFX6-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v2, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v8, vcc
@@ -9277,12 +9428,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s7, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s6, v5
@@ -9295,12 +9446,13 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
+; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
-; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
+; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 2, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, v7, v9, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v7, v8, v10, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v4
@@ -9310,10 +9462,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s1
@@ -9439,36 +9590,36 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_add_u32 s4, s16, 2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
-; GFX9-NEXT:    s_addc_u32 s0, s17, 0
-; GFX9-NEXT:    s_add_u32 s19, s16, 1
-; GFX9-NEXT:    s_addc_u32 s1, s17, 0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s5, s5, s18
-; GFX9-NEXT:    s_cmp_ge_u32 s5, s13
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    s_add_u32 s0, s16, 1
+; GFX9-NEXT:    s_addc_u32 s4, s17, 0
+; GFX9-NEXT:    s_add_u32 s1, s16, 2
+; GFX9-NEXT:    s_addc_u32 s19, s17, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT:    s_cselect_b32 s18, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, s13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s5, s18
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s13
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v2, s17
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-NEXT:    v_mov_b32_e32 v2, s4
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
 ; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; GFX9-NEXT:    s_add_u32 s8, s10, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_addc_u32 s9, s11, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s16
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
@@ -9579,31 +9730,31 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s15
 ; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    s_add_u32 s6, s12, 2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
-; GFX9-NEXT:    s_addc_u32 s0, s13, 0
-; GFX9-NEXT:    s_add_u32 s15, s12, 1
-; GFX9-NEXT:    s_addc_u32 s1, s13, 0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s7, s7, s14
-; GFX9-NEXT:    s_cmp_ge_u32 s7, s9
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    s_add_u32 s0, s12, 1
+; GFX9-NEXT:    s_addc_u32 s6, s13, 0
+; GFX9-NEXT:    s_add_u32 s1, s12, 2
+; GFX9-NEXT:    s_addc_u32 s15, s13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s0
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
-; GFX9-NEXT:    s_cmp_eq_u32 s7, s9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NEXT:    v_mov_b32_e32 v6, s15
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s7, s14
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, s14
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, s13
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
@@ -9632,23 +9783,19 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0xffed2705
-; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s8, 0xffed2705
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX6-NEXT:    s_add_u32 s2, s2, s8
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s4
-; GFX6-NEXT:    s_mov_b32 s9, s8
-; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s8
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s8
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -9660,8 +9807,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
-; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v7, vcc
@@ -9669,11 +9814,10 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s4
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s8
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s8
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s8
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v3
@@ -9688,57 +9832,64 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v7, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_ashr_i32 s8, s7, 31
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GFX6-NEXT:    s_add_u32 s0, s6, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_addc_u32 s1, s7, s8
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
+; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
+; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT:    s_mov_b32 s0, s4
+; GFX6-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX6-NEXT:    s_mov_b32 s1, s5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v2, s7
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v0
 ; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s0, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v2
 ; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
-; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
+; GFX6-NEXT:    s_mov_b32 s4, 0x12d8fa
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: srem_i64_oddk_denom:
@@ -9747,7 +9898,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -9755,115 +9906,115 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX9-NEXT:    s_mul_hi_u32 s2, s1, 0xffed2705
-; GFX9-NEXT:    s_mul_i32 s3, s0, 0xffed2705
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_sub_i32 s2, s2, s1
-; GFX9-NEXT:    s_mul_i32 s9, s1, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s3, s1, s2
-; GFX9-NEXT:    s_mul_i32 s8, s1, s2
-; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s9
-; GFX9-NEXT:    s_add_u32 s1, s1, s8
-; GFX9-NEXT:    s_addc_u32 s3, 0, s3
-; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s9
-; GFX9-NEXT:    s_mul_i32 s9, s0, s9
-; GFX9-NEXT:    s_add_u32 s1, s1, s9
-; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT:    s_addc_u32 s1, s3, s10
-; GFX9-NEXT:    s_addc_u32 s3, s8, 0
-; GFX9-NEXT:    s_mul_i32 s2, s0, s2
-; GFX9-NEXT:    s_add_u32 s1, s1, s2
-; GFX9-NEXT:    s_addc_u32 s2, 0, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_hi_u32 s6, s5, 0xffed2705
+; GFX9-NEXT:    s_mul_i32 s7, s4, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_sub_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_i32 s9, s5, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s7, s5, s6
+; GFX9-NEXT:    s_mul_i32 s8, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s5, s9
+; GFX9-NEXT:    s_add_u32 s5, s5, s8
+; GFX9-NEXT:    s_addc_u32 s7, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s10, s4, s9
+; GFX9-NEXT:    s_mul_i32 s9, s4, s9
+; GFX9-NEXT:    s_add_u32 s5, s5, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT:    s_addc_u32 s5, s7, s10
+; GFX9-NEXT:    s_addc_u32 s7, s8, 0
+; GFX9-NEXT:    s_mul_i32 s6, s4, s6
+; GFX9-NEXT:    s_add_u32 s5, s5, s6
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s0, s0, s2
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    s_mul_i32 s1, s0, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xffed2705
-; GFX9-NEXT:    s_add_i32 s3, s3, s1
-; GFX9-NEXT:    s_sub_i32 s1, s3, s2
-; GFX9-NEXT:    s_mul_i32 s8, s2, 0xffed2705
-; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s1
-; GFX9-NEXT:    s_mul_i32 s12, s2, s1
-; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s8
-; GFX9-NEXT:    s_add_u32 s2, s2, s12
-; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
-; GFX9-NEXT:    s_mul_i32 s10, s0, s8
+; GFX9-NEXT:    s_addc_u32 s4, s4, s6
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s4, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_sub_i32 s5, s7, s6
+; GFX9-NEXT:    s_mul_i32 s8, s6, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s5
+; GFX9-NEXT:    s_mul_i32 s12, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s6, s6, s8
+; GFX9-NEXT:    s_add_u32 s6, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s4, s8
+; GFX9-NEXT:    s_mul_i32 s10, s4, s8
 ; GFX9-NEXT:    s_addc_u32 s8, 0, s11
-; GFX9-NEXT:    s_add_u32 s2, s2, s10
-; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
-; GFX9-NEXT:    s_addc_u32 s2, s8, s9
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    s_mul_i32 s1, s0, s1
-; GFX9-NEXT:    s_add_u32 s1, s2, s1
-; GFX9-NEXT:    s_addc_u32 s2, 0, s3
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    s_add_u32 s6, s6, s10
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s5
+; GFX9-NEXT:    s_addc_u32 s6, s8, s9
+; GFX9-NEXT:    s_addc_u32 s7, s7, 0
+; GFX9-NEXT:    s_mul_i32 s5, s4, s5
+; GFX9-NEXT:    s_add_u32 s5, s6, s5
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_addc_u32 s8, s0, s2
+; GFX9-NEXT:    s_addc_u32 s6, s4, s6
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX9-NEXT:    s_add_u32 s0, s6, s2
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s1, s7, s2
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX9-NEXT:    s_mul_i32 s6, s0, s8
-; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s7
-; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s8
-; GFX9-NEXT:    s_add_u32 s6, s9, s6
-; GFX9-NEXT:    s_addc_u32 s3, 0, s3
-; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s7
-; GFX9-NEXT:    s_mul_i32 s7, s1, s7
-; GFX9-NEXT:    s_add_u32 s6, s6, s7
-; GFX9-NEXT:    s_mul_hi_u32 s9, s1, s8
-; GFX9-NEXT:    s_addc_u32 s3, s3, s10
-; GFX9-NEXT:    s_addc_u32 s6, s9, 0
-; GFX9-NEXT:    s_mul_i32 s7, s1, s8
-; GFX9-NEXT:    s_add_u32 s3, s3, s7
-; GFX9-NEXT:    s_addc_u32 s6, 0, s6
-; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x12d8fb
-; GFX9-NEXT:    s_mul_i32 s3, s3, 0x12d8fb
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s7, s2, s6
+; GFX9-NEXT:    s_mul_hi_u32 s9, s2, s8
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_add_u32 s7, s9, s7
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
+; GFX9-NEXT:    s_mul_hi_u32 s10, s3, s8
+; GFX9-NEXT:    s_mul_i32 s8, s3, s8
+; GFX9-NEXT:    s_add_u32 s7, s7, s8
+; GFX9-NEXT:    s_mul_hi_u32 s9, s3, s6
+; GFX9-NEXT:    s_addc_u32 s5, s5, s10
+; GFX9-NEXT:    s_addc_u32 s7, s9, 0
+; GFX9-NEXT:    s_mul_i32 s6, s3, s6
+; GFX9-NEXT:    s_add_u32 s5, s5, s6
+; GFX9-NEXT:    s_addc_u32 s6, 0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s8, s5, 0x12d8fb
+; GFX9-NEXT:    s_mul_i32 s5, s5, 0x12d8fb
 ; GFX9-NEXT:    s_mul_i32 s6, s6, 0x12d8fb
-; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    s_add_i32 s8, s8, s6
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0x12d8fb
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s3, s1, s8
+; GFX9-NEXT:    s_subb_u32 s2, s3, s8
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s7, v0
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s0, s3, 0
+; GFX9-NEXT:    s_subb_u32 s3, s2, 0
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s7, v1
 ; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s1, s0, 0
+; GFX9-NEXT:    s_subb_u32 s5, s3, 0
 ; GFX9-NEXT:    s_mov_b32 s6, 0x12d8fa
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v5, s0
-; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    v_mov_b32_e32 v5, s3
+; GFX9-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v0
-; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, s3
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
@@ -9982,7 +10133,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GFX6-NEXT:    s_mov_b32 s5, s1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -10020,8 +10171,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s9, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s13, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s12, v0
@@ -10038,19 +10189,19 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v5, s13
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v4, s13
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s10
@@ -10180,26 +10331,26 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    s_subb_u32 s2, s6, s9
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v2
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_subb_u32 s0, s2, 0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s2, s7, s11
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s9
-; GFX9-NEXT:    v_mov_b32_e32 v5, s12
-; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    s_subb_u32 s2, s2, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s12
+; GFX9-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s7, s11
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s9
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
-; GFX9-NEXT:    s_cmp_eq_u32 s2, s9
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s9
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
@@ -10354,7 +10505,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -10391,8 +10542,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, s16, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -10408,49 +10559,49 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX6-NEXT:    s_add_u32 s4, s14, s2
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v5, s5
-; GFX6-NEXT:    s_mov_b32 s3, s2
-; GFX6-NEXT:    s_addc_u32 s5, s15, s2
-; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s5
-; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GFX6-NEXT:    s_ashr_i32 s0, s15, 31
+; GFX6-NEXT:    s_add_u32 s2, s14, s0
+; GFX6-NEXT:    s_mov_b32 s1, s0
+; GFX6-NEXT:    s_addc_u32 s3, s15, s0
+; GFX6-NEXT:    v_mov_b32_e32 v4, s5
+; GFX6-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
+; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s5
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
-; GFX6-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
-; GFX6-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX6-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GFX6-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GFX6-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v6
-; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; GFX6-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v4
+; GFX6-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    s_sub_u32 s0, 0, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
 ; GFX6-NEXT:    s_subb_u32 s1, 0, s5
-; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
 ; GFX6-NEXT:    s_ashr_i32 s14, s7, 31
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
-; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v3
+; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v3
+; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v8, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v4, v5
@@ -10460,15 +10611,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v2, v4
@@ -10511,8 +10662,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
@@ -10529,19 +10680,19 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v7, s7
-; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
+; GFX6-NEXT:    v_mov_b32_e32 v6, s7
+; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v4, s14
@@ -10672,33 +10823,33 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_subb_u32 s2, s4, s13
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v1
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_subb_u32 s0, s2, 0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s2, s5, s15
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s13
-; GFX9-NEXT:    v_mov_b32_e32 v5, s16
-; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    s_subb_u32 s2, s2, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s5, s15
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s13
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX9-NEXT:    s_cmp_eq_u32 s2, s13
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s13
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-NEXT:    s_add_u32 s2, s10, s0
 ; GFX9-NEXT:    s_mov_b32 s1, s0
 ; GFX9-NEXT:    s_addc_u32 s3, s11, s0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s14, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s14, v2
 ; GFX9-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v3
@@ -10809,26 +10960,26 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_subb_u32 s2, s6, s5
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v3
 ; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT:    s_subb_u32 s0, s2, 0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    s_subb_u32 s2, s7, s11
-; GFX9-NEXT:    s_cmp_ge_u32 s2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v7, s12
-; GFX9-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-NEXT:    s_subb_u32 s2, s2, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s0, s7, s11
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s5
+; GFX9-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
-; GFX9-NEXT:    s_cmp_eq_u32 s2, s5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v8, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v8, s3
+; GFX9-NEXT:    s_cmp_eq_u32 s0, s5
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v8, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_mov_b32_e32 v7, s0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s10, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s10

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index d10d0dd74741d..ccf9eec087b25 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -40,12 +40,16 @@ define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 }
 
 ; GCN-LABEL: {{^}}select_and_v4:
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}},
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}},
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}},
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}},
+; GCN:     s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, 0
+; GCN:     s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, 0
+; GCN:     s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, 0
+; GCN:     s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, 0
+; GCN:     v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
+; GCN:     v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
+; GCN:     v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
+; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
 ; GCN-NOT: v_and_b32
-; GCN:     store_dword
+; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
 define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -94,12 +98,16 @@ define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) {
 }
 
 ; GCN-LABEL: {{^}}select_or_v4:
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}},
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}},
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}},
-; GCN:     v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}},
+; GCN:     s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, -1
+; GCN:     s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, -1
+; GCN:     s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, -1
+; GCN:     s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, -1
 ; GCN-NOT: v_or_b32
-; GCN:     store_dword
+; GCN:     v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
+; GCN:     v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
+; GCN:     v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
+; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
+; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
 define amdgpu_kernel void @select_or_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) {
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -147,10 +155,15 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(<2 x i
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v4i32:
-; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9,
-; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 6, 5,
-; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 10, 6,
-; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 14, 7,
+; GCN:     s_cselect_b32 s[[SEL0:[0-9]+]], 7, 14
+; GCN:     s_cselect_b32 s[[SEL1:[0-9]+]], 6, 10
+; GCN:     s_cselect_b32 s[[SEL2:[0-9]+]], 5, 6
+; GCN:     s_cselect_b32 s[[SEL3:[0-9]+]], 9, 2
+; GCN:     v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
+; GCN:     v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
+; GCN:     v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
+; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
+; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(<4 x i32> addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, <4 x i32> <i32 -4, i32 2, i32 3, i32 4>, <4 x i32> <i32 3, i32 1, i32 -1, i32 -3>
   %bo = sub <4 x i32> <i32 5, i32 7, i32 9, i32 11>, %sel
@@ -261,14 +274,16 @@ define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(<2 x half> addrspac
 }
 
 ; GCN-LABEL: {{^}}fsub_constant_sel_constants_v4f32:
-; GCN-DAG: v_mov_b32_e32 [[T2:v[0-9]+]], 0x40a00000
-; GCN-DAG: v_mov_b32_e32 [[T3:v[0-9]+]], 0x41100000
-; GCN-DAG: v_mov_b32_e32 [[T4:v[0-9]+]], 0x41500000
-; GCN-DAG: v_mov_b32_e32 [[F4:v[0-9]+]], 0x40c00000
-; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0,
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, [[T2]],
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[T3]],
-; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[F4]], [[T4]],
+; GCN:     s_mov_b32 [[T0:s[0-9]+]], 0x41500000
+; GCN:     s_cselect_b32 s[[SEL0:[0-9]+]], [[T0]], 0x40c00000
+; GCN:     s_cselect_b32 s[[SEL1:[0-9]+]], 0x41100000, 4.0
+; GCN:     s_cselect_b32 s[[SEL2:[0-9]+]], 0x40a00000, 2.0
+; GCN:     s_cselect_b32 s[[SEL3:[0-9]+]], 1.0, 0
+; GCN:     v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
+; GCN:     v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
+; GCN:     v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
+; GCN:     v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
+; GCN:     global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
 define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(<4 x float> addrspace(1)* %p, i1 %cond) {
   %sel = select i1 %cond, <4 x float> <float -2.0, float -3.0, float -4.0, float -5.0>, <4 x float> <float -1.0, float 0.0, float 1.0, float 2.0>
   %bo = fsub <4 x float> <float -1.0, float 2.0, float 5.0, float 8.0>, %sel

diff  --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 22478a74f2a7e..776de7db20657 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,11 +14,12 @@ define i32 @s_add_co_select_user() {
 ; GFX7-NEXT:    v_add_i32_e64 v0, s[4:5], s6, s6
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX7-NEXT:    s_addc_u32 s4, s6, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX7-NEXT:    s_addc_u32 s7, s6, 0
+; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX7-NEXT:    s_cselect_b32 s4, s7, 0
 ; GFX7-NEXT:    s_cmp_gt_u32 s6, 31
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -31,11 +32,12 @@ define i32 @s_add_co_select_user() {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[4:5], s6, s6
 ; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_addc_u32 s4, s6, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    s_addc_u32 s7, s6, 0
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    s_cselect_b32 s4, s7, 0
 ; GFX9-NEXT:    s_cmp_gt_u32 s6, 31
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -51,10 +53,11 @@ define i32 @s_add_co_select_user() {
 ; GFX10-NEXT:    s_cmpk_lg_u32 s5, 0x0
 ; GFX10-NEXT:    s_addc_u32 s5, s4, 0
 ; GFX10-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX10-NEXT:    s_cmp_gt_u32 s4, 31
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, s5, s6
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: s_add_co_select_user:
@@ -65,15 +68,15 @@ define i32 @s_add_co_select_user() {
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, s1, s0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmpk_lg_u32 s1, 0x0
 ; GFX11-NEXT:    s_addc_u32 s1, s0, 0
 ; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX11-NEXT:    s_cmp_gt_u32 s0, 31
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, s1, s2
 ; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = load volatile i32, i32 addrspace(4)* null, align 8

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 0196de16e1090..c5dd7a2e39b37 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -38,15 +38,15 @@ entry:
 
 ; GCN-LABEL: {{^}}double4_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0xe147ae14, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x4000147a, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40100a3d, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x70a3d70a, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:
@@ -57,18 +57,17 @@ entry:
 
 ; GCN-LABEL: {{^}}double5_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 4
-; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0xe147ae14, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x4000147a, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40100a3d, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x70a3d70a, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 4
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40140a3d, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:
@@ -107,10 +106,9 @@ entry:
 
 ; GCN-LABEL: {{^}}double2_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index 094ae27b5c574..ffa9b912eae3a 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -14,14 +14,12 @@ define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
@@ -31,18 +29,15 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 248f5fc985eee..b2f5697383f4d 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -31,10 +31,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
@@ -60,14 +59,12 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
@@ -77,18 +74,15 @@ define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 3
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt

diff  --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index da852af3f2303..a5787714fb7b4 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -13,21 +13,20 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; FIXME: We should be using s_addk_i32 here, but the reg allocation hints
-;        are not always followed.
-; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01
-; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]]
+; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
 ; SI-DAG: s_andn2_b64
 ; SI-DAG: cmp_gt_i32
-; SI-DAG: cndmask_b32
-; SI-DAG: cndmask_b32
+; SI-DAG: s_cselect_b32
+; SI-DAG: s_cselect_b32
 ; SI-DAG: cmp_lt_i32
-; SI-DAG: cndmask_b32
-; SI-DAG: cndmask_b32
-; SI-DAG: v_cmp_gt_f64
-; SI-DAG: v_cmp_lg_f64
-; SI-DAG: v_cndmask_b32
-; SI: v_cndmask_b32
+; SI-DAG: s_cselect_b32
+; SI-DAG: s_cselect_b32
+; SI-DAG: v_cmp_gt_f64_e64 [[FCMP:s[[0-9]+:[0-9]+]]]
+; SI-DAG: v_cmp_lg_f64_e32 vcc
+; SI-DAG: s_and_b64 [[AND1:s[[0-9]+:[0-9]+]]], [[FCMP]], vcc
+; SI-DAG: s_and_b64 [[AND1]], [[AND1]], exec
+; SI-DAG: s_cselect_b32 s{{[0-9]+}}, 0x3ff00000, 0
 ; SI: v_add_f64
 ; SI: s_endpgm
 define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) {

diff  --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 4dd0ff333a013..d8098f6d7ead4 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1002,22 +1002,27 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
 ; SI-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
+; SI-NEXT:    v_readfirstlane_b32 s2, v5
+; SI-NEXT:    s_bfe_u32 s0, s2, 0xb0014
+; SI-NEXT:    s_add_i32 s3, s0, 0xfffffc01
 ; SI-NEXT:    s_mov_b32 s1, 0xfffff
 ; SI-NEXT:    s_mov_b32 s0, s6
-; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
-; SI-NEXT:    v_not_b32_e32 v6, v6
+; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s3
+; SI-NEXT:    v_not_b32_e32 v6, s0
 ; SI-NEXT:    v_and_b32_e32 v6, v4, v6
-; SI-NEXT:    v_not_b32_e32 v7, v7
-; SI-NEXT:    v_and_b32_e32 v7, v5, v7
-; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
-; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
-; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
+; SI-NEXT:    v_not_b32_e32 v7, s1
+; SI-NEXT:    v_and_b32_e32 v5, v5, v7
+; SI-NEXT:    s_and_b32 s0, s2, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s3, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v7, s0
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    s_cmp_gt_i32 s3, 51
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v7, s2
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1185,21 +1190,21 @@ define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(
 define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; SI-LABEL: fast_frem_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s8
-; SI-NEXT:    s_mov_b32 s5, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s6
-; SI-NEXT:    s_mov_b32 s11, s7
-; SI-NEXT:    s_mov_b32 s2, s6
-; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
+; SI-NEXT:    s_mov_b32 s6, s2
+; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1209,24 +1214,29 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 ; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
 ; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
-; SI-NEXT:    s_mov_b32 s1, 0xfffff
-; SI-NEXT:    s_mov_b32 s0, s6
-; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
-; SI-NEXT:    v_not_b32_e32 v6, v6
+; SI-NEXT:    v_readfirstlane_b32 s6, v5
+; SI-NEXT:    s_bfe_u32 s4, s6, 0xb0014
+; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
+; SI-NEXT:    s_mov_b32 s5, 0xfffff
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; SI-NEXT:    v_not_b32_e32 v6, s4
 ; SI-NEXT:    v_and_b32_e32 v6, v4, v6
-; SI-NEXT:    v_not_b32_e32 v7, v7
-; SI-NEXT:    v_and_b32_e32 v7, v5, v7
-; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
-; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
-; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
+; SI-NEXT:    v_not_b32_e32 v7, s5
+; SI-NEXT:    v_and_b32_e32 v5, v5, v7
+; SI-NEXT:    s_and_b32 s4, s6, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v7, s4
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: fast_frem_f64:
@@ -1373,21 +1383,21 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
 define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; SI-LABEL: unsafe_frem_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s8
-; SI-NEXT:    s_mov_b32 s5, s9
-; SI-NEXT:    s_mov_b32 s8, s10
-; SI-NEXT:    s_mov_b32 s9, s11
-; SI-NEXT:    s_mov_b32 s10, s6
-; SI-NEXT:    s_mov_b32 s11, s7
-; SI-NEXT:    s_mov_b32 s2, s6
-; SI-NEXT:    s_mov_b32 s3, s7
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
+; SI-NEXT:    s_mov_b32 s6, s2
+; SI-NEXT:    s_mov_b32 s7, s3
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[8:11], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; SI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -1397,24 +1407,29 @@ define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double add
 ; SI-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; SI-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
 ; SI-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
-; SI-NEXT:    v_bfe_u32 v6, v5, 20, 11
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0xfffffc01, v6
-; SI-NEXT:    s_mov_b32 s1, 0xfffff
-; SI-NEXT:    s_mov_b32 s0, s6
-; SI-NEXT:    v_lshr_b64 v[6:7], s[0:1], v8
-; SI-NEXT:    v_not_b32_e32 v6, v6
+; SI-NEXT:    v_readfirstlane_b32 s6, v5
+; SI-NEXT:    s_bfe_u32 s4, s6, 0xb0014
+; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
+; SI-NEXT:    s_mov_b32 s5, 0xfffff
+; SI-NEXT:    s_mov_b32 s4, s2
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
+; SI-NEXT:    v_not_b32_e32 v6, s4
 ; SI-NEXT:    v_and_b32_e32 v6, v4, v6
-; SI-NEXT:    v_not_b32_e32 v7, v7
-; SI-NEXT:    v_and_b32_e32 v7, v5, v7
-; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v5
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v8
-; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v8
-; SI-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
+; SI-NEXT:    v_not_b32_e32 v7, s5
+; SI-NEXT:    v_and_b32_e32 v5, v5, v7
+; SI-NEXT:    s_and_b32 s4, s6, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v7, s4
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; SI-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: unsafe_frem_f64:
@@ -3125,21 +3140,26 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
 ; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
-; SI-NEXT:    v_bfe_u32 v10, v9, 20, 11
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 0xfffffc01, v10
+; SI-NEXT:    v_readfirstlane_b32 s8, v9
+; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
+; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
 ; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    v_lshr_b64 v[10:11], s[2:3], v12
-; SI-NEXT:    v_not_b32_e32 v10, v10
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
+; SI-NEXT:    v_not_b32_e32 v10, s0
 ; SI-NEXT:    v_and_b32_e32 v10, v8, v10
-; SI-NEXT:    v_not_b32_e32 v11, v11
-; SI-NEXT:    v_and_b32_e32 v11, v9, v11
-; SI-NEXT:    v_and_b32_e32 v13, 0x80000000, v9
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v12
-; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v12
-; SI-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
+; SI-NEXT:    v_not_b32_e32 v11, s1
+; SI-NEXT:    v_and_b32_e32 v9, v9, v11
+; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s9, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v8, v10, v8, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v11, s0
+; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; SI-NEXT:    s_cmp_gt_i32 s9, 51
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
 ; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
 ; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
 ; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
@@ -3156,20 +3176,25 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
 ; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
-; SI-NEXT:    v_bfe_u32 v8, v7, 20, 11
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0xfffffc01, v8
-; SI-NEXT:    v_lshr_b64 v[8:9], s[2:3], v10
-; SI-NEXT:    v_not_b32_e32 v8, v8
+; SI-NEXT:    v_readfirstlane_b32 s8, v7
+; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
+; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
+; SI-NEXT:    v_not_b32_e32 v8, s0
 ; SI-NEXT:    v_and_b32_e32 v8, v6, v8
-; SI-NEXT:    v_not_b32_e32 v9, v9
-; SI-NEXT:    v_and_b32_e32 v9, v7, v9
-; SI-NEXT:    v_and_b32_e32 v11, 0x80000000, v7
-; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v10
-; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], 51, v10
-; SI-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
+; SI-NEXT:    v_not_b32_e32 v9, s1
+; SI-NEXT:    v_and_b32_e32 v7, v7, v9
+; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s9, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v9, s0
+; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT:    s_cmp_gt_i32 s9, 51
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
index 226125335c38d..1f67442c45a9e 100644
--- a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -11,7 +11,7 @@ declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
 
 ; FUNC-LABEL: {{^}}v_ftrunc_f64:
 ; CI: v_trunc_f64
-; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
+; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0xb0014
 ; SI: s_endpgm
 define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %x = load double, double addrspace(1)* %in, align 8
@@ -29,11 +29,11 @@ define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrsp
 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
 ; SI-DAG: s_andn2_b64
 ; SI-DAG: cmp_gt_i32
-; SI-DAG: cndmask_b32
-; SI-DAG: cndmask_b32
+; SI-DAG: s_cselect_b32
+; SI-DAG: s_cselect_b32
 ; SI-DAG: cmp_lt_i32
-; SI-DAG: cndmask_b32
-; SI-DAG: cndmask_b32
+; SI-DAG: s_cselect_b32
+; SI-DAG: s_cselect_b32
 ; SI: s_endpgm
 define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone

diff  --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 1c1332a53b594..2781d1e1b5213 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -25,25 +25,21 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    s_mul_i32 s7, s3, s6
 ; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s6
 ; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_not_b32 s9, s6
 ; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_mul_i32 s9, s4, s9
+; GFX9-NEXT:    s_add_i32 s8, s6, 1
 ; GFX9-NEXT:    s_add_i32 s7, s2, s7
+; GFX9-NEXT:    s_add_i32 s9, s2, s9
 ; GFX9-NEXT:    s_cmp_ge_u32 s7, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_add_i32 s7, s6, 1
-; GFX9-NEXT:    s_not_b32 s6, s6
-; GFX9-NEXT:    s_mul_i32 s6, s4, s6
-; GFX9-NEXT:    v_mov_b32_e32 v4, s7
-; GFX9-NEXT:    s_add_i32 s6, s2, s6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v4, s6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v2
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX9-NEXT:    s_add_i32 s8, s6, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s4
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX9-NEXT:    s_add_u32 s2, s2, 1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -73,23 +69,21 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_mul_i32 s7, s3, s6
 ; GFX10-NEXT:    s_mul_hi_u32 s6, s2, s6
 ; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_not_b32 s8, s6
 ; GFX10-NEXT:    s_mul_i32 s7, s5, s6
+; GFX10-NEXT:    s_mul_i32 s8, s4, s8
 ; GFX10-NEXT:    s_add_i32 s7, s2, s7
+; GFX10-NEXT:    s_add_i32 s9, s6, 1
+; GFX10-NEXT:    s_add_i32 s8, s2, s8
 ; GFX10-NEXT:    s_cmp_ge_u32 s7, s4
-; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX10-NEXT:    s_cselect_b32 s7, s8, s7
 ; GFX10-NEXT:    s_add_i32 s8, s6, 1
-; GFX10-NEXT:    s_not_b32 s9, s6
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    s_mul_i32 s8, s4, s9
-; GFX10-NEXT:    s_add_i32 s8, s2, s8
+; GFX10-NEXT:    s_cmp_ge_u32 s7, s4
+; GFX10-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX10-NEXT:    s_add_u32 s2, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, s7, v3, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
@@ -127,26 +121,22 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX11-NEXT:    s_mul_i32 s7, s3, s6
 ; GFX11-NEXT:    s_mul_hi_u32 s6, s2, s6
 ; GFX11-NEXT:    s_add_i32 s6, s6, s7
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_not_b32 s8, s6
 ; GFX11-NEXT:    s_mul_i32 s7, s5, s6
+; GFX11-NEXT:    s_mul_i32 s8, s4, s8
 ; GFX11-NEXT:    s_add_i32 s7, s2, s7
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s9, s6, 1
+; GFX11-NEXT:    s_add_i32 s8, s2, s8
 ; GFX11-NEXT:    s_cmp_ge_u32 s7, s4
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s6, s9, s6
+; GFX11-NEXT:    s_cselect_b32 s7, s8, s7
 ; GFX11-NEXT:    s_add_i32 s8, s6, 1
-; GFX11-NEXT:    s_not_b32 s9, s6
-; GFX11-NEXT:    v_mov_b32_e32 v2, s8
-; GFX11-NEXT:    s_mul_i32 s8, s4, s9
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_add_i32 s8, s2, s8
+; GFX11-NEXT:    s_cmp_ge_u32 s7, s4
+; GFX11-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX11-NEXT:    s_add_u32 s2, s2, 1
-; GFX11-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-NEXT:    s_addc_u32 s3, s3, 0
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, s7, v3 :: v_dual_add_nc_u32 v4, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, 4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0
@@ -330,38 +320,39 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-LABEL: sdiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s2
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  .LBB2_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_hi_u32 v2, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v2, s3
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s6
+; GFX9-NEXT:    s_mul_i32 s7, s6, s3
+; GFX9-NEXT:    s_sub_i32 s7, s4, s7
+; GFX9-NEXT:    s_add_i32 s8, s6, 1
+; GFX9-NEXT:    s_sub_i32 s9, s7, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX9-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX9-NEXT:    s_add_i32 s8, s6, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX9-NEXT:    s_xor_b32 s6, s6, s2
+; GFX9-NEXT:    s_sub_i32 s6, s6, s2
 ; GFX9-NEXT:    s_add_i32 s4, s4, 1
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -393,21 +384,19 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_add_i32 s6, s6, s7
 ; GFX10-NEXT:    s_mul_hi_u32 s6, s4, s6
 ; GFX10-NEXT:    s_mul_i32 s7, s6, s3
+; GFX10-NEXT:    s_add_i32 s8, s6, 1
 ; GFX10-NEXT:    s_sub_i32 s7, s4, s7
+; GFX10-NEXT:    s_sub_i32 s9, s7, s3
 ; GFX10-NEXT:    s_cmp_ge_u32 s7, s3
-; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX10-NEXT:    s_cselect_b32 s7, s9, s7
 ; GFX10-NEXT:    s_add_i32 s8, s6, 1
+; GFX10-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX10-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX10-NEXT:    s_add_i32 s4, s4, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s8
-; GFX10-NEXT:    s_sub_i32 s8, s7, s3
-; GFX10-NEXT:    v_mov_b32_e32 v3, s8
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, s7, v3, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s2, v2
+; GFX10-NEXT:    s_xor_b32 s6, s6, s2
+; GFX10-NEXT:    s_sub_i32 s6, s6, s2
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
@@ -449,25 +438,21 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mul_hi_u32 s6, s4, s6
 ; GFX11-NEXT:    s_mul_i32 s7, s6, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s8, s6, 1
 ; GFX11-NEXT:    s_sub_i32 s7, s4, s7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_sub_i32 s9, s7, s3
 ; GFX11-NEXT:    s_cmp_ge_u32 s7, s3
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX11-NEXT:    s_cselect_b32 s7, s9, s7
 ; GFX11-NEXT:    s_add_i32 s8, s6, 1
+; GFX11-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX11-NEXT:    s_cselect_b32 s6, s8, s6
 ; GFX11-NEXT:    s_add_i32 s4, s4, 1
-; GFX11-NEXT:    v_mov_b32_e32 v2, s8
-; GFX11-NEXT:    s_sub_i32 s8, s7, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v3, s8
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, s7, v3 :: v_dual_add_nc_u32 v4, 1, v2
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, s2, v2
+; GFX11-NEXT:    s_xor_b32 s6, s6, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_sub_i32 s6, s6, s2
+; GFX11-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX11-NEXT:    global_store_b32 v1, v2, s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, 4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 979de26f77caf..1af5080038f63 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -15,17 +15,15 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
 ; GFX8V3-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V3-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8V3-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX8V3-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
-; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX8V3-NEXT:    s_cselect_b32 s0, s0, 0
 ; GFX8V3-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX8V3-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8V3-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8V3-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX8V3-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
+; GFX8V3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V3-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8V3-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX8V3-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX8V3-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8V3-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8V3-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V3-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8V3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V3-NEXT:    v_mov_b32_e32 v0, 2
@@ -40,17 +38,15 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
 ; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8V4-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX8V4-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
-; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX8V4-NEXT:    s_cselect_b32 s0, s0, 0
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX8V4-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8V4-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8V4-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX8V4-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
+; GFX8V4-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8V4-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX8V4-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8V4-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, 2
@@ -65,17 +61,15 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
 ; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX8V5-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8V5-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX8V5-NEXT:    s_cselect_b32 s0, s0, 0
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX8V5-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
-; GFX8V5-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8V5-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX8V5-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
+; GFX8V5-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8V5-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8V5-NEXT:    s_cselect_b32 s0, s3, 0
+; GFX8V5-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8V5-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX8V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v4
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, 2
@@ -88,22 +82,20 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
 ; GFX9V3-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9V3-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
 ; GFX9V3-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX9V3-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V3-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V3-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX9V3-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9V3-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
+; GFX9V3-NEXT:    s_cselect_b32 s0, s0, 0
 ; GFX9V3-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V3-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX9V3-NEXT:    s_cselect_b32 s2, s2, 0
 ; GFX9V3-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9V3-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX9V3-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9V3-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9V3-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9V3-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
+; GFX9V3-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9V3-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9V3-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX9V3-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9V3-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9V3-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V3-NEXT:    flat_store_dword v[0:1], v4
 ; GFX9V3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V3-NEXT:    v_mov_b32_e32 v0, 2
@@ -116,22 +108,20 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9V4-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
 ; GFX9V4-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX9V4-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9V4-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
+; GFX9V4-NEXT:    s_cselect_b32 s0, s0, 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V4-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX9V4-NEXT:    s_cselect_b32 s2, s2, 0
 ; GFX9V4-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX9V4-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9V4-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9V4-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
+; GFX9V4-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9V4-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9V4-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9V4-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v4
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, 2
@@ -144,22 +134,20 @@ define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 add
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9V5-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
 ; GFX9V5-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
-; GFX9V5-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9V5-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
+; GFX9V5-NEXT:    s_cselect_b32 s0, s0, 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9V5-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
+; GFX9V5-NEXT:    s_cselect_b32 s2, s2, 0
 ; GFX9V5-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
-; GFX9V5-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9V5-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9V5-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
+; GFX9V5-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9V5-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9V5-NEXT:    s_cselect_b32 s1, s1, 0
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9V5-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v4
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, 2

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
index 8e93f3a139b49..36053945e1341 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll
@@ -12,66 +12,32 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b32 s33, s6
-; CHECK-NEXT:    v_mov_b32_e32 v31, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_bitcmp1_b32 s4, 0
-; CHECK-NEXT:    s_cselect_b64 vcc, -1, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    s_load_dword s7, s[4:5], 0x0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, wobble at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, wobble at gotpcrel32@hi+12
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, snork at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, snork at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_mov_b64 s[4:5], exec
+; CHECK-NEXT:    s_getpc_b64 s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s8, snork at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s9, s9, snork at gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[8:9], 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s9
-; CHECK-NEXT:    v_mov_b32_e32 v1, s11
-; CHECK-NEXT:    v_mov_b32_e32 v2, s8
-; CHECK-NEXT:    v_mov_b32_e32 v4, s10
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CHECK-NEXT:    s_and_b32 s4, 1, s7
+; CHECK-NEXT:    s_cmp_eq_u32 s4, 1
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_cselect_b32 s5, s13, s11
+; CHECK-NEXT:    s_cselect_b32 s4, s12, s10
+; CHECK-NEXT:    s_mov_b32 s12, s6
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_readfirstlane_b32 s4, v2
-; CHECK-NEXT:    v_readfirstlane_b32 s5, v3
-; CHECK-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
-; CHECK-NEXT:    s_and_saveexec_b64 s[34:35], vcc
-; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:    s_mov_b32 s12, s33
-; CHECK-NEXT:    v_mov_b32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; CHECK-NEXT:    ; implicit-def: $vgpr31
-; CHECK-NEXT:    ; implicit-def: $vgpr1
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[34:35]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
-; CHECK-NEXT:  ; %bb.2:
 ; CHECK-NEXT:    s_endpgm
 
-; CHECK: .amdhsa_kernarg_size 0
-; CHECK-NEXT: .amdhsa_user_sgpr_count 6
-; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
-; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
-; CHECK-NEXT: .amdhsa_user_sgpr_queue_ptr 0
-; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
-; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0
-; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
-; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; CHECK-NEXT: .amdhsa_uses_dynamic_stack 1
-; CHECK-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
-; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
-; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
-; CHECK-NEXT: .amdhsa_system_sgpr_workgroup_info 0
-; CHECK-NEXT: .amdhsa_system_vgpr_workitem_id 0
 bb:
   %cond = load i1, i1 addrspace(4)* null
   %tmp = select i1 %cond, void (i8*, i32, i8*)* bitcast (void ()* @wobble to void (i8*, i32, i8*)*), void (i8*, i32, i8*)* bitcast (void ()* @snork to void (i8*, i32, i8*)*)

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 1c955ee622d2c..c46f853a3da20 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -586,21 +586,18 @@ define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 1
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    s_cselect_b32 s3, 0x3ff00000, s7
+; GCN-NEXT:    s_cselect_b32 s6, 0, s6
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, 0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_cselect_b32 s2, 0x3ff00000, s5
+; GCN-NEXT:    s_cselect_b32 s4, 0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -617,51 +614,45 @@ define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x84
 ; GCN-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
-; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 4
-; GCN-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v9, v0, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    s_cselect_b32 s9, 0x3ff00000, s9
+; GCN-NEXT:    s_cselect_b32 s8, 0, s8
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 1
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_cselect_b32 s3, 0x3ff00000, s3
+; GCN-NEXT:    s_cselect_b32 s2, 0, s2
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    s_cselect_b32 s8, 0x3ff00000, s1
+; GCN-NEXT:    s_cselect_b32 s9, 0, s0
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v5, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v5, s6
+; GCN-NEXT:    s_cselect_b32 s0, 0x3ff00000, s7
+; GCN-NEXT:    s_cselect_b32 s1, 0, s6
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 2
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s5, 0x3ff00000, s5
+; GCN-NEXT:    s_cselect_b32 s4, 0, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    s_add_u32 s0, s10, 16
-; GCN-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    s_addc_u32 s1, s11, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT:    v_mov_b32_e32 v7, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-NEXT:    v_mov_b32_e32 v6, s10
+; GCN-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    v_mov_b32_e32 v7, s11
 ; GCN-NEXT:    s_add_u32 s0, s10, 32
-; GCN-NEXT:    v_mov_b32_e32 v4, s10
-; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
 ; GCN-NEXT:    s_addc_u32 s1, s11, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 53d3a5d491805..1823db53f1596 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -693,16 +693,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_cmp_lg_u32 s8, 1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s0, s3, 5
 ; SI-NEXT:    s_cmp_lg_u32 s8, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    s_cselect_b32 s1, s2, 5
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -738,17 +736,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s8, 2
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s2, s2, 5
 ; SI-NEXT:    s_cmp_lg_u32 s8, 1
-; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s1, s1, 5
 ; SI-NEXT:    s_cmp_lg_u32 s8, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
+; SI-NEXT:    s_cselect_b32 s0, s0, 5
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s2
 ; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -779,31 +774,26 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
-; SI-NEXT:    s_load_dword s4, s[4:5], 0x11
-; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
+; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
+; SI-NEXT:    s_load_dword s9, s[4:5], 0x11
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0x100f000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_eq_u32 s6, 3
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_cmp_eq_u32 s6, 2
-; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s10
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_eq_u32 s6, 1
-; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_eq_u32 s6, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 3
+; SI-NEXT:    s_cselect_b32 s3, s9, s3
+; SI-NEXT:    s_cmp_eq_u32 s8, 2
+; SI-NEXT:    s_cselect_b32 s2, s9, s2
+; SI-NEXT:    s_cmp_eq_u32 s8, 1
+; SI-NEXT:    s_cselect_b32 s1, s9, s1
+; SI-NEXT:    s_cmp_eq_u32 s8, 0
+; SI-NEXT:    s_cselect_b32 s0, s9, s0
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v4i32:
@@ -1229,116 +1219,88 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s4, s11, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 15
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s4, s11, 16
+; SI-NEXT:    s_cselect_b32 s4, s4, 5
+; SI-NEXT:    s_lshl_b32 s4, s4, 8
+; SI-NEXT:    s_lshr_b32 s5, s11, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 14
-; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_lshr_b32 s4, s11, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    s_cselect_b32 s5, s5, 5
+; SI-NEXT:    s_and_b32 s5, s5, 0xff
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshr_b32 s5, s11, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 13
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s5, s5, 5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 12
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s11
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    s_lshr_b32 s4, s10, 24
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_cselect_b32 s7, s11, 5
+; SI-NEXT:    s_and_b32 s7, s7, 0xff
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_lshr_b32 s5, s10, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 11
-; SI-NEXT:    v_or_b32_e32 v3, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s4, s10, 16
+; SI-NEXT:    s_cselect_b32 s5, s5, 5
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_lshr_b32 s7, s10, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 10
-; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_lshr_b32 s4, s10, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    s_cselect_b32 s7, s7, 5
+; SI-NEXT:    s_and_b32 s7, s7, 0xff
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshr_b32 s7, s10, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 9
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s7, s7, 5
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 8
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    s_lshr_b32 s4, s9, 24
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_cselect_b32 s10, s10, 5
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_or_b32 s7, s10, s7
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    s_lshr_b32 s7, s9, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 7
-; SI-NEXT:    v_or_b32_e32 v2, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s4, s9, 16
+; SI-NEXT:    s_cselect_b32 s7, s7, 5
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    s_lshr_b32 s10, s9, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 6
-; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    s_lshr_b32 s4, s9, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    s_cselect_b32 s10, s10, 5
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_or_b32 s7, s10, s7
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshr_b32 s10, s9, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 5
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s10, s10, 5
+; SI-NEXT:    s_lshl_b32 s10, s10, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 4
-; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_or_b32_e32 v1, v4, v1
-; SI-NEXT:    s_lshr_b32 s4, s8, 24
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_cselect_b32 s9, s9, 5
+; SI-NEXT:    s_and_b32 s9, s9, 0xff
+; SI-NEXT:    s_or_b32 s9, s9, s10
+; SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_lshr_b32 s9, s8, 24
 ; SI-NEXT:    s_cmp_lg_u32 s6, 3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_lshr_b32 s4, s8, 16
+; SI-NEXT:    s_cselect_b32 s9, s9, 5
+; SI-NEXT:    s_lshl_b32 s9, s9, 8
+; SI-NEXT:    s_lshr_b32 s10, s8, 16
 ; SI-NEXT:    s_cmp_lg_u32 s6, 2
-; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; SI-NEXT:    s_lshr_b32 s4, s8, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_cselect_b32 s10, s10, 5
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshr_b32 s10, s8, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 1
-; SI-NEXT:    v_or_b32_e32 v0, v4, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s10, s10, 5
+; SI-NEXT:    s_lshl_b32 s10, s10, 8
 ; SI-NEXT:    s_cmp_lg_u32 s6, 0
-; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v0, v4, v0
+; SI-NEXT:    s_cselect_b32 s6, s8, 5
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_or_b32 s6, s6, s10
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s6, s6, s9
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -1534,22 +1496,19 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
 ; SI-NEXT:    s_load_dword s8, s[4:5], 0x18
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xc
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s8, 1
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
+; SI-NEXT:    s_cselect_b32 s2, 0, s2
 ; SI-NEXT:    s_cmp_eq_u32 s8, 0
-; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
+; SI-NEXT:    s_cselect_b32 s0, 0, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -1558,22 +1517,19 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
 ; VI-NEXT:    s_load_dword s8, s[4:5], 0x60
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x30
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s8, 1
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_cselect_b32 s3, 0x40200000, s3
+; VI-NEXT:    s_cselect_b32 s2, 0, s2
 ; VI-NEXT:    s_cmp_eq_u32 s8, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; VI-NEXT:    s_cselect_b32 s1, 0x40200000, s1
+; VI-NEXT:    s_cselect_b32 s0, 0, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
@@ -1584,47 +1540,43 @@ define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)
 define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s10, s[4:5], 0x8
+; SI-NEXT:    s_load_dword s8, s[4:5], 0x8
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_eq_u32 s10, 1
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_cmp_eq_u32 s10, 0
-; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
+; SI-NEXT:    s_cmp_eq_u32 s8, 1
+; SI-NEXT:    s_cselect_b32 s3, 0, s3
+; SI-NEXT:    s_cselect_b32 s2, 5, s2
+; SI-NEXT:    s_cmp_eq_u32 s8, 0
+; SI-NEXT:    s_cselect_b32 s1, 0, s1
+; SI-NEXT:    s_cselect_b32 s0, 5, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s2
+; SI-NEXT:    v_mov_b32_e32 v3, s3
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s10, s[4:5], 0x20
+; VI-NEXT:    s_load_dword s8, s[4:5], 0x20
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s10, 1
-; VI-NEXT:    v_mov_b32_e32 v0, s3
-; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[8:9]
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_cmp_eq_u32 s10, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[8:9]
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[2:3]
+; VI-NEXT:    s_cmp_eq_u32 s8, 1
+; VI-NEXT:    s_cselect_b32 s3, 0, s3
+; VI-NEXT:    s_cselect_b32 s2, 5, s2
+; VI-NEXT:    s_cmp_eq_u32 s8, 0
+; VI-NEXT:    s_cselect_b32 s1, 0, s1
+; VI-NEXT:    s_cselect_b32 s0, 5, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[2:3]
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
@@ -1635,63 +1587,57 @@ define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %
 define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s12, s[4:5], 0x10
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xc
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_eq_u32 s12, 1
-; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[6:7]
-; SI-NEXT:    v_mov_b32_e32 v0, s10
-; SI-NEXT:    s_cmp_eq_u32 s12, 0
-; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[6:7]
-; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[6:7]
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    s_cmp_eq_u32 s12, 2
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[6:7]
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[6:7]
-; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    s_cmp_eq_u32 s6, 1
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[6:7]
-; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
+; SI-NEXT:    s_cselect_b32 s7, 0, s11
+; SI-NEXT:    s_cselect_b32 s10, 5, s10
+; SI-NEXT:    s_cmp_eq_u32 s6, 0
+; SI-NEXT:    s_cselect_b32 s9, 0, s9
+; SI-NEXT:    s_cselect_b32 s8, 5, s8
+; SI-NEXT:    s_cmp_eq_u32 s6, 2
+; SI-NEXT:    s_cselect_b32 s5, 0, s5
+; SI-NEXT:    s_cselect_b32 s4, 5, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v3i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s12, s[4:5], 0x40
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x30
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_eq_u32 s12, 1
-; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v0, s10
-; VI-NEXT:    s_cmp_eq_u32 s12, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v0, s9
-; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    s_cmp_eq_u32 s12, 2
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[6:7]
-; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    s_cmp_eq_u32 s6, 1
 ; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[6:7]
-; VI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
+; VI-NEXT:    s_cselect_b32 s7, 0, s11
+; VI-NEXT:    s_cselect_b32 s10, 5, s10
+; VI-NEXT:    s_cmp_eq_u32 s6, 0
+; VI-NEXT:    s_cselect_b32 s9, 0, s9
+; VI-NEXT:    s_cselect_b32 s8, 5, s8
+; VI-NEXT:    s_cmp_eq_u32 s6, 2
+; VI-NEXT:    s_cselect_b32 s5, 0, s5
+; VI-NEXT:    s_cselect_b32 s4, 5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
@@ -1704,36 +1650,32 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0x10
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
-; SI-NEXT:    v_mov_b32_e32 v4, 0x40200000
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s6, 1
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cselect_b32 s4, 0x40200000, s11
+; SI-NEXT:    s_cselect_b32 s5, 0, s10
 ; SI-NEXT:    s_cmp_eq_u32 s6, 0
-; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_cselect_b32 s7, 0x40200000, s9
+; SI-NEXT:    s_cselect_b32 s8, 0, s8
 ; SI-NEXT:    s_cmp_eq_u32 s6, 3
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    s_cselect_b32 s9, 0x40200000, s15
+; SI-NEXT:    s_cselect_b32 s10, 0, s14
 ; SI-NEXT:    s_cmp_eq_u32 s6, 2
-; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s12
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    s_cselect_b32 s6, 0x40200000, s13
+; SI-NEXT:    s_cselect_b32 s11, 0, s12
+; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -1741,36 +1683,32 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s6, s[4:5], 0x40
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
-; VI-NEXT:    v_mov_b32_e32 v4, 0x40200000
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s6, 1
-; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cselect_b32 s4, 0x40200000, s11
+; VI-NEXT:    s_cselect_b32 s5, 0, s10
 ; VI-NEXT:    s_cmp_eq_u32 s6, 0
-; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s9
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    s_cselect_b32 s7, 0x40200000, s9
+; VI-NEXT:    s_cselect_b32 s8, 0, s8
 ; VI-NEXT:    s_cmp_eq_u32 s6, 3
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, s15
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, s14
+; VI-NEXT:    s_cselect_b32 s9, 0x40200000, s15
+; VI-NEXT:    s_cselect_b32 s10, 0, s14
 ; VI-NEXT:    s_cmp_eq_u32 s6, 2
-; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, s13
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, s12
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    s_cselect_b32 s6, 0x40200000, s13
+; VI-NEXT:    s_cselect_b32 s11, 0, s12
+; VI-NEXT:    v_mov_b32_e32 v0, s11
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT:    s_nop 0
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index ade927bdd19be..0a62c42969bb9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -284,10 +284,9 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
 ; SI-NEXT:    s_mul_i32 s1, s1, s3
-; SI-NEXT:    s_mul_i32 s0, s0, s2
+; SI-NEXT:    s_mul_i32 s2, s0, s2
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s0
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
@@ -297,8 +296,10 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
 ; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 0, s2
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -322,13 +323,12 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX9-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX9-NEXT:    s_add_i32 s1, s8, s7
 ; GFX9-NEXT:    s_add_i32 s1, s1, s6
+; GFX9-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    s_mul_i32 s2, s0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GFX9-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX9-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -354,9 +354,10 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX10-NEXT:    s_add_i32 s1, s1, s6
 ; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
+; GFX10-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX10-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -382,10 +383,10 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX11-NEXT:    s_add_i32 s1, s1, s6
 ; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX11-NEXT:    s_cselect_b32 s1, 0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -413,35 +414,38 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; SI-NEXT:    v_mul_hi_i32 v2, s1, v2
 ; SI-NEXT:    s_mul_i32 s6, s1, s3
-; SI-NEXT:    s_cmp_lt_i32 s1, 0
-; SI-NEXT:    s_mul_i32 s1, s0, s2
-; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
-; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s1
-; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
-; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; SI-NEXT:    s_mul_i32 s8, s0, s2
+; SI-NEXT:    v_readfirstlane_b32 s9, v1
+; SI-NEXT:    v_readfirstlane_b32 s10, v3
+; SI-NEXT:    v_readfirstlane_b32 s11, v0
+; SI-NEXT:    v_readfirstlane_b32 s12, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
-; SI-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v0
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v1
-; SI-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v2, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_lt_i32 s3, 0
-; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v6, v1, v3, vcc
+; SI-NEXT:    s_add_u32 s5, s11, s5
+; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v0
+; SI-NEXT:    s_addc_u32 s10, 0, s10
+; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
+; SI-NEXT:    s_add_u32 s4, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
-; SI-NEXT:    v_subrev_i32_e32 v7, vcc, s0, v6
-; SI-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v2, vcc
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
-; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
-; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
+; SI-NEXT:    s_addc_u32 s4, s10, s9
+; SI-NEXT:    s_addc_u32 s5, s12, 0
+; SI-NEXT:    s_add_u32 s4, s4, s6
+; SI-NEXT:    s_addc_u32 s5, 0, s5
+; SI-NEXT:    s_sub_u32 s2, s4, s2
+; SI-NEXT:    s_subb_u32 s6, s5, 0
+; SI-NEXT:    s_cmp_lt_i32 s1, 0
+; SI-NEXT:    s_cselect_b32 s1, s6, s5
+; SI-NEXT:    s_cselect_b32 s2, s2, s4
+; SI-NEXT:    s_sub_u32 s0, s2, s0
+; SI-NEXT:    s_subb_u32 s4, s1, 0
+; SI-NEXT:    s_cmp_lt_i32 s3, 0
+; SI-NEXT:    s_cselect_b32 s1, s4, s1
+; SI-NEXT:    s_cselect_b32 s0, s0, s2
+; SI-NEXT:    v_cmp_ne_u64_e32 vcc, s[0:1], v[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_cselect_b32 s0, 0, s8
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -451,44 +455,38 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mul_i32 s7, s0, s3
 ; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
 ; GFX9-NEXT:    s_add_u32 s9, s8, s7
-; GFX9-NEXT:    s_mul_i32 s5, s1, s2
-; GFX9-NEXT:    s_addc_u32 s6, 0, s6
+; GFX9-NEXT:    s_mul_i32 s6, s1, s2
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT:    s_add_u32 s9, s9, s5
+; GFX9-NEXT:    s_add_u32 s9, s9, s6
 ; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT:    s_addc_u32 s4, s6, s4
-; GFX9-NEXT:    s_addc_u32 s6, s10, 0
+; GFX9-NEXT:    s_addc_u32 s4, s5, s4
+; GFX9-NEXT:    s_addc_u32 s5, s10, 0
 ; GFX9-NEXT:    s_mul_i32 s9, s1, s3
 ; GFX9-NEXT:    s_add_u32 s4, s4, s9
-; GFX9-NEXT:    s_addc_u32 s6, 0, s6
+; GFX9-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX9-NEXT:    s_sub_u32 s9, s4, s2
-; GFX9-NEXT:    s_subb_u32 s10, s6, 0
+; GFX9-NEXT:    s_subb_u32 s10, s5, 0
 ; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s9
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX9-NEXT:    s_cselect_b32 s1, s10, s5
+; GFX9-NEXT:    s_sub_u32 s9, s4, s0
+; GFX9-NEXT:    s_subb_u32 s5, s1, 0
 ; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s1
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
 ; GFX9-NEXT:    s_add_i32 s1, s8, s7
-; GFX9-NEXT:    s_add_i32 s1, s1, s5
-; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_add_i32 s1, s1, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX9-NEXT:    s_mov_b32 s7, s6
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s2
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
+; GFX9-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX9-NEXT:    s_cselect_b32 s0, 0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -498,40 +496,38 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_mul_i32 s7, s0, s3
 ; GFX10-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT:    s_mul_hi_u32 s6, s0, s3
-; GFX10-NEXT:    s_mul_i32 s5, s1, s2
+; GFX10-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX10-NEXT:    s_mul_i32 s6, s1, s2
 ; GFX10-NEXT:    s_add_u32 s11, s8, s7
 ; GFX10-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT:    s_addc_u32 s6, 0, s6
+; GFX10-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX10-NEXT:    s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT:    s_add_u32 s11, s11, s5
+; GFX10-NEXT:    s_add_u32 s11, s11, s6
 ; GFX10-NEXT:    s_mul_i32 s10, s1, s3
-; GFX10-NEXT:    s_addc_u32 s4, s6, s4
-; GFX10-NEXT:    s_addc_u32 s6, s9, 0
+; GFX10-NEXT:    s_addc_u32 s4, s5, s4
+; GFX10-NEXT:    s_addc_u32 s5, s9, 0
 ; GFX10-NEXT:    s_add_u32 s4, s4, s10
-; GFX10-NEXT:    s_addc_u32 s6, 0, s6
+; GFX10-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX10-NEXT:    s_sub_u32 s9, s4, s2
-; GFX10-NEXT:    s_subb_u32 s10, s6, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    s_subb_u32 s10, s5, 0
 ; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s10
-; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT:    s_cselect_b32 s1, s9, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s10, s5
+; GFX10-NEXT:    s_sub_u32 s9, s1, s0
+; GFX10-NEXT:    s_subb_u32 s5, s4, 0
 ; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
-; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX10-NEXT:    s_add_i32 s1, s8, s7
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s2
-; GFX10-NEXT:    s_add_i32 s1, s1, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 s5, s5, s4
+; GFX10-NEXT:    s_cselect_b32 s4, s9, s1
+; GFX10-NEXT:    s_add_i32 s1, s8, s7
+; GFX10-NEXT:    s_add_i32 s1, s1, s6
+; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX10-NEXT:    s_mov_b32 s7, s6
+; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
+; GFX10-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX10-NEXT:    s_cselect_b32 s1, 0, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -541,42 +537,40 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mul_i32 s7, s0, s3
 ; GFX11-NEXT:    s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT:    s_mul_hi_u32 s6, s0, s3
-; GFX11-NEXT:    s_mul_i32 s5, s1, s2
+; GFX11-NEXT:    s_mul_hi_u32 s5, s0, s3
+; GFX11-NEXT:    s_mul_i32 s6, s1, s2
 ; GFX11-NEXT:    s_add_u32 s11, s8, s7
 ; GFX11-NEXT:    s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT:    s_addc_u32 s6, 0, s6
+; GFX11-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX11-NEXT:    s_mul_hi_i32 s9, s1, s3
-; GFX11-NEXT:    s_add_u32 s11, s11, s5
+; GFX11-NEXT:    s_add_u32 s11, s11, s6
 ; GFX11-NEXT:    s_mul_i32 s10, s1, s3
-; GFX11-NEXT:    s_addc_u32 s4, s6, s4
-; GFX11-NEXT:    s_addc_u32 s6, s9, 0
+; GFX11-NEXT:    s_addc_u32 s4, s5, s4
+; GFX11-NEXT:    s_addc_u32 s5, s9, 0
 ; GFX11-NEXT:    s_add_u32 s4, s4, s10
-; GFX11-NEXT:    s_addc_u32 s6, 0, s6
+; GFX11-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX11-NEXT:    s_sub_u32 s9, s4, s2
-; GFX11-NEXT:    s_subb_u32 s10, s6, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v0, s10
+; GFX11-NEXT:    s_subb_u32 s10, s5, 0
 ; GFX11-NEXT:    s_cmp_lt_i32 s1, 0
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s1, s9, s4
+; GFX11-NEXT:    s_cselect_b32 s4, s10, s5
+; GFX11-NEXT:    s_sub_u32 s9, s1, s0
+; GFX11-NEXT:    s_subb_u32 s5, s4, 0
 ; GFX11-NEXT:    s_cmp_lt_i32 s3, 0
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
-; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT:    s_add_i32 s1, s8, s7
 ; GFX11-NEXT:    s_mul_i32 s0, s0, s2
-; GFX11-NEXT:    s_add_i32 s1, s1, s5
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v0, v1 :: v_dual_cndmask_b32 v0, v2, v3
-; GFX11-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX11-NEXT:    s_cselect_b32 s5, s5, s4
+; GFX11-NEXT:    s_cselect_b32 s4, s9, s1
+; GFX11-NEXT:    s_add_i32 s1, s8, s7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s1, s1, s6
+; GFX11-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b32 s7, s6
+; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], s[6:7]
+; GFX11-NEXT:    s_cselect_b32 s0, 0, s0
+; GFX11-NEXT:    s_cselect_b32 s1, 0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mov_b32 s5, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 995379cdf5b13..8b21896d64603 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -5,42 +5,37 @@
 define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
 ; SI-LABEL: round_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s1, 0xfffff
-; SI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
-; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s9, 0xfffff
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    v_mov_b32_e32 v2, 0x3ff00000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
-; SI-NEXT:    s_mov_b32 s8, s4
-; SI-NEXT:    s_add_i32 s4, s0, 0xfffffc01
-; SI-NEXT:    s_mov_b32 s0, s10
-; SI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
-; SI-NEXT:    s_andn2_b64 s[2:3], s[6:7], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s7, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s4, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s4, 51
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_bfe_u32 s0, s3, 0xb0014
+; SI-NEXT:    s_addk_i32 s0, 0xfc01
+; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], s0
+; SI-NEXT:    s_andn2_b64 s[8:9], s[2:3], s[8:9]
+; SI-NEXT:    s_and_b32 s5, s3, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s0, 0
+; SI-NEXT:    s_cselect_b32 s8, 0, s8
+; SI-NEXT:    s_cselect_b32 s5, s5, s9
+; SI-NEXT:    s_cmp_gt_i32 s0, 51
+; SI-NEXT:    s_cselect_b32 s8, s2, s8
+; SI-NEXT:    s_cselect_b32 s9, s3, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_add_f64 v[0:1], s[2:3], -v[0:1]
 ; SI-NEXT:    s_brev_b32 s0, -2
-; SI-NEXT:    v_mov_b32_e32 v5, s7
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; SI-NEXT:    v_bfi_b32 v4, s0, v4, v5
-; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
-; SI-NEXT:    s_mov_b32 s9, s5
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; SI-NEXT:    v_mov_b32_e32 v3, s3
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; SI-NEXT:    v_bfi_b32 v2, s0, v2, v3
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: round_f64:
@@ -144,66 +139,56 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
 define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
 ; SI-LABEL: round_v2f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s9, 0xfffff
+; SI-NEXT:    s_mov_b32 s8, s2
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s7, s0, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
-; SI-NEXT:    s_andn2_b64 s[12:13], s[10:11], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s7, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s13
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s7, 51
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:    s_bfe_u32 s3, s7, 0xb0014
+; SI-NEXT:    s_addk_i32 s3, 0xfc01
+; SI-NEXT:    s_lshr_b64 s[10:11], s[8:9], s3
+; SI-NEXT:    s_andn2_b64 s[10:11], s[6:7], s[10:11]
+; SI-NEXT:    s_and_b32 s12, s7, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s3, 0
+; SI-NEXT:    s_cselect_b32 s10, 0, s10
+; SI-NEXT:    s_cselect_b32 s11, s12, s11
+; SI-NEXT:    s_cmp_gt_i32 s3, 51
+; SI-NEXT:    s_cselect_b32 s10, s6, s10
+; SI-NEXT:    s_cselect_b32 s11, s7, s11
+; SI-NEXT:    v_mov_b32_e32 v0, s10
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s12
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s10, s0, 0xfffffc01
-; SI-NEXT:    s_brev_b32 s7, -2
-; SI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; SI-NEXT:    v_mov_b32_e32 v4, s11
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s10
-; SI-NEXT:    v_bfi_b32 v4, s7, v6, v4
-; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s9, 0x80000000
-; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    v_add_f64 v[0:1], s[6:7], -v[0:1]
+; SI-NEXT:    s_brev_b32 s3, -2
+; SI-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; SI-NEXT:    v_bfi_b32 v2, s3, v4, v2
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_bfe_u32 s6, s5, 0xb0014
+; SI-NEXT:    v_add_f64 v[2:3], s[10:11], v[0:1]
+; SI-NEXT:    s_add_i32 s10, s6, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], s10
+; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[6:7]
+; SI-NEXT:    s_and_b32 s8, s5, 0x80000000
 ; SI-NEXT:    s_cmp_lt_i32 s10, 0
-; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s6, 0, s6
+; SI-NEXT:    s_cselect_b32 s7, s8, s7
 ; SI-NEXT:    s_cmp_gt_i32 s10, 51
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
-; SI-NEXT:    v_mov_b32_e32 v7, s9
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; SI-NEXT:    v_bfi_b32 v6, s7, v6, v7
-; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, 0
-; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    s_cselect_b32 s6, s4, s6
+; SI-NEXT:    s_cselect_b32 s7, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[0:1]
+; SI-NEXT:    v_mov_b32_e32 v5, s5
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; SI-NEXT:    v_bfi_b32 v4, s3, v4, v5
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: round_v2f64:
@@ -242,116 +227,96 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 ; SI-LABEL: round_v4f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
-; SI-NEXT:    s_mov_b32 s14, -1
-; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_mov_b32 s2, s14
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s13, 0xfffff
+; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
-; SI-NEXT:    s_add_i32 s18, s0, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s18
-; SI-NEXT:    s_andn2_b64 s[16:17], s[6:7], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s7, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s18, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s17
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s18, 51
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s0, s5, 0xb0014
-; SI-NEXT:    s_add_i32 s17, s0, 0xfffffc01
+; SI-NEXT:    s_bfe_u32 s12, s7, 0xb0014
+; SI-NEXT:    s_add_i32 s16, s12, 0xfffffc01
+; SI-NEXT:    s_mov_b32 s12, s2
+; SI-NEXT:    s_lshr_b64 s[14:15], s[12:13], s16
+; SI-NEXT:    s_andn2_b64 s[14:15], s[6:7], s[14:15]
+; SI-NEXT:    s_and_b32 s17, s7, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s16, 0
+; SI-NEXT:    s_cselect_b32 s14, 0, s14
+; SI-NEXT:    s_cselect_b32 s15, s17, s15
+; SI-NEXT:    s_cmp_gt_i32 s16, 51
+; SI-NEXT:    s_cselect_b32 s14, s6, s14
+; SI-NEXT:    s_cselect_b32 s15, s7, s15
+; SI-NEXT:    v_mov_b32_e32 v0, s14
+; SI-NEXT:    v_mov_b32_e32 v1, s15
+; SI-NEXT:    v_add_f64 v[0:1], s[6:7], -v[0:1]
 ; SI-NEXT:    s_brev_b32 s16, -2
-; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
-; SI-NEXT:    v_mov_b32_e32 v4, s7
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
-; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
-; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s5, 0x80000000
-; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_cmp_lt_i32 s17, 0
-; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s17, 51
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; SI-NEXT:    v_bfi_b32 v2, s16, v8, v2
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_bfe_u32 s6, s5, 0xb0014
+; SI-NEXT:    v_add_f64 v[2:3], s[14:15], v[0:1]
+; SI-NEXT:    s_add_i32 s14, s6, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[6:7], s[12:13], s14
+; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[6:7]
+; SI-NEXT:    s_and_b32 s15, s5, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s14, 0
+; SI-NEXT:    s_cselect_b32 s6, 0, s6
+; SI-NEXT:    s_cselect_b32 s7, s15, s7
+; SI-NEXT:    s_cmp_gt_i32 s14, 51
+; SI-NEXT:    s_cselect_b32 s6, s4, s6
+; SI-NEXT:    s_cselect_b32 s7, s5, s7
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[0:1]
+; SI-NEXT:    s_bfe_u32 s4, s11, 0xb0014
+; SI-NEXT:    s_add_i32 s14, s4, 0xfffffc01
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    s_lshr_b64 s[4:5], s[12:13], s14
+; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[4:5]
+; SI-NEXT:    s_and_b32 s15, s11, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s14, 0
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; SI-NEXT:    s_cselect_b32 s4, 0, s4
+; SI-NEXT:    s_cselect_b32 s5, s15, s5
+; SI-NEXT:    s_cmp_gt_i32 s14, 51
+; SI-NEXT:    v_bfi_b32 v4, s16, v8, v4
+; SI-NEXT:    s_cselect_b32 s4, s10, s4
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; SI-NEXT:    s_cselect_b32 s5, s11, s5
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
-; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
-; SI-NEXT:    s_add_i32 s6, s0, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s6
-; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    v_mov_b32_e32 v5, s5
+; SI-NEXT:    v_add_f64 v[4:5], s[10:11], -v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v6, s11
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
-; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
-; SI-NEXT:    s_cmp_lt_i32 s6, 0
-; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s6, 51
-; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s11
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
+; SI-NEXT:    v_bfi_b32 v6, s16, v8, v6
+; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, 0
+; SI-NEXT:    v_add_f64 v[6:7], s[4:5], v[4:5]
+; SI-NEXT:    s_bfe_u32 s4, s9, 0xb0014
+; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[4:5], s[12:13], s10
+; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
+; SI-NEXT:    s_and_b32 s11, s9, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s10, 0
+; SI-NEXT:    s_cselect_b32 s4, 0, s4
+; SI-NEXT:    s_cselect_b32 s5, s11, s5
+; SI-NEXT:    s_cmp_gt_i32 s10, 51
+; SI-NEXT:    s_cselect_b32 s4, s8, s4
+; SI-NEXT:    s_cselect_b32 s5, s9, s5
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, s10
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s4, s0, 0xfffffc01
-; SI-NEXT:    v_mov_b32_e32 v10, s11
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s4
-; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
-; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s9, 0x80000000
-; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, 0
-; SI-NEXT:    s_cmp_lt_i32 s4, 0
-; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; SI-NEXT:    v_mov_b32_e32 v4, s3
-; SI-NEXT:    v_mov_b32_e32 v5, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s4, 51
-; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s9
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v4, s2
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, s8
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; SI-NEXT:    v_add_f64 v[10:11], s[8:9], -v[4:5]
-; SI-NEXT:    v_mov_b32_e32 v13, s9
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
-; SI-NEXT:    v_bfi_b32 v12, s16, v12, v13
-; SI-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, 0
-; SI-NEXT:    v_mov_b32_e32 v8, 0
-; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[10:11]
-; SI-NEXT:    s_mov_b32 s15, 0xf000
-; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[8:9]
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT:    v_mov_b32_e32 v5, s5
+; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v9, s9
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; SI-NEXT:    v_bfi_b32 v8, s16, v8, v9
+; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, 0
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    v_add_f64 v[4:5], s[4:5], v[4:5]
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: round_v4f64:
@@ -407,219 +372,178 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-LABEL: round_v8f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
-; SI-NEXT:    s_mov_b32 s22, -1
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_mov_b32 s21, 0xfffff
-; SI-NEXT:    s_mov_b32 s20, s22
 ; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s2, s7, 0xb0014
-; SI-NEXT:    s_add_i32 s26, s2, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s26
-; SI-NEXT:    s_and_b32 s23, s7, 0x80000000
-; SI-NEXT:    s_andn2_b64 s[24:25], s[6:7], s[2:3]
-; SI-NEXT:    s_cmp_lt_i32 s26, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s25
-; SI-NEXT:    v_mov_b32_e32 v1, s23
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s26, 51
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
-; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s2, s5, 0xb0014
-; SI-NEXT:    s_add_i32 s24, s2, 0xfffffc01
-; SI-NEXT:    s_brev_b32 s23, -2
-; SI-NEXT:    v_mov_b32_e32 v4, s7
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s24
-; SI-NEXT:    v_bfi_b32 v4, s23, v8, v4
-; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_bfe_u32 s20, s7, 0xb0014
+; SI-NEXT:    s_add_i32 s24, s20, 0xfffffc01
+; SI-NEXT:    s_mov_b32 s20, s2
+; SI-NEXT:    s_lshr_b64 s[22:23], s[20:21], s24
+; SI-NEXT:    s_andn2_b64 s[22:23], s[6:7], s[22:23]
+; SI-NEXT:    s_and_b32 s25, s7, 0x80000000
 ; SI-NEXT:    s_cmp_lt_i32 s24, 0
-; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cselect_b32 s22, 0, s22
+; SI-NEXT:    s_cselect_b32 s23, s25, s23
 ; SI-NEXT:    s_cmp_gt_i32 s24, 51
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:    s_cselect_b32 s22, s6, s22
+; SI-NEXT:    s_cselect_b32 s23, s7, s23
+; SI-NEXT:    v_mov_b32_e32 v0, s22
+; SI-NEXT:    v_mov_b32_e32 v1, s23
+; SI-NEXT:    v_add_f64 v[0:1], s[6:7], -v[0:1]
+; SI-NEXT:    s_brev_b32 s6, -2
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; SI-NEXT:    v_bfi_b32 v2, s6, v8, v2
+; SI-NEXT:    s_bfe_u32 s7, s5, 0xb0014
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_addk_i32 s7, 0xfc01
+; SI-NEXT:    v_add_f64 v[2:3], s[22:23], v[0:1]
+; SI-NEXT:    s_lshr_b64 s[22:23], s[20:21], s7
+; SI-NEXT:    s_andn2_b64 s[22:23], s[4:5], s[22:23]
+; SI-NEXT:    s_and_b32 s24, s5, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b32 s22, 0, s22
+; SI-NEXT:    s_cselect_b32 s23, s24, s23
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b32 s22, s4, s22
+; SI-NEXT:    s_cselect_b32 s23, s5, s23
+; SI-NEXT:    v_mov_b32_e32 v0, s22
+; SI-NEXT:    v_mov_b32_e32 v1, s23
+; SI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[0:1]
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
+; SI-NEXT:    s_bfe_u32 s4, s11, 0xb0014
+; SI-NEXT:    v_bfi_b32 v4, s6, v8, v4
+; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
+; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s7
+; SI-NEXT:    v_add_f64 v[0:1], s[22:23], v[0:1]
+; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[4:5]
+; SI-NEXT:    s_and_b32 s22, s11, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b32 s4, 0, s4
+; SI-NEXT:    s_cselect_b32 s5, s22, s5
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b32 s4, s10, s4
+; SI-NEXT:    s_cselect_b32 s5, s11, s5
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
-; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s2, 0xfffffc01
-; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    v_mov_b32_e32 v5, s5
+; SI-NEXT:    v_add_f64 v[4:5], s[10:11], -v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v6, s11
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
-; SI-NEXT:    v_bfi_b32 v6, s23, v8, v6
-; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s11, 0x80000000
+; SI-NEXT:    v_bfi_b32 v6, s6, v8, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
-; SI-NEXT:    s_cmp_lt_i32 s6, 0
-; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s6, 51
-; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s11
-; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, s10
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[2:3]
-; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s2, 0xfffffc01
-; SI-NEXT:    v_mov_b32_e32 v9, s11
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
-; SI-NEXT:    v_bfi_b32 v9, s23, v8, v9
-; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s9, 0x80000000
-; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v9, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, 0
-; SI-NEXT:    s_cmp_lt_i32 s6, 0
-; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; SI-NEXT:    v_mov_b32_e32 v4, s5
-; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s6, 51
-; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s9
-; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
+; SI-NEXT:    v_add_f64 v[6:7], s[4:5], v[4:5]
+; SI-NEXT:    s_bfe_u32 s4, s9, 0xb0014
+; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s7
+; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
+; SI-NEXT:    s_and_b32 s10, s9, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b32 s4, 0, s4
+; SI-NEXT:    s_cselect_b32 s5, s10, s5
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b32 s4, s8, s4
+; SI-NEXT:    s_cselect_b32 s5, s9, s5
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s8
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[2:3]
-; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
-; SI-NEXT:    v_add_f64 v[9:10], s[8:9], -v[4:5]
-; SI-NEXT:    s_add_i32 s4, s2, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s4
-; SI-NEXT:    v_mov_b32_e32 v11, s9
+; SI-NEXT:    v_mov_b32_e32 v5, s5
+; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v9, s9
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; SI-NEXT:    v_bfi_b32 v9, s6, v8, v9
+; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v9, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, 0
+; SI-NEXT:    v_add_f64 v[4:5], s[4:5], v[4:5]
+; SI-NEXT:    s_bfe_u32 s4, s15, 0xb0014
+; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s7
+; SI-NEXT:    s_andn2_b64 s[4:5], s[14:15], s[4:5]
+; SI-NEXT:    s_and_b32 s8, s15, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b32 s4, 0, s4
+; SI-NEXT:    s_cselect_b32 s5, s8, s5
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b32 s5, s15, s5
+; SI-NEXT:    s_cselect_b32 s4, s14, s4
+; SI-NEXT:    v_mov_b32_e32 v10, s5
+; SI-NEXT:    v_mov_b32_e32 v9, s4
+; SI-NEXT:    v_add_f64 v[9:10], s[14:15], -v[9:10]
+; SI-NEXT:    v_mov_b32_e32 v11, s15
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5
-; SI-NEXT:    s_andn2_b64 s[24:25], s[14:15], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s15, 0x80000000
-; SI-NEXT:    v_bfi_b32 v11, s23, v8, v11
-; SI-NEXT:    s_cmp_lt_i32 s4, 0
+; SI-NEXT:    v_bfi_b32 v11, s6, v8, v11
 ; SI-NEXT:    v_cndmask_b32_e32 v10, 0, v11, vcc
 ; SI-NEXT:    v_mov_b32_e32 v9, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s4, 51
-; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[9:10]
-; SI-NEXT:    v_mov_b32_e32 v10, s2
-; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    v_add_f64 v[10:11], s[4:5], v[9:10]
 ; SI-NEXT:    s_bfe_u32 s4, s13, 0xb0014
-; SI-NEXT:    s_add_i32 s6, s4, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s6
-; SI-NEXT:    s_andn2_b64 s[26:27], s[12:13], s[4:5]
-; SI-NEXT:    s_and_b32 s4, s13, 0x80000000
-; SI-NEXT:    v_mov_b32_e32 v9, s25
-; SI-NEXT:    s_cmp_lt_i32 s6, 0
-; SI-NEXT:    v_cndmask_b32_e32 v15, v9, v10, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s6, 51
-; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT:    s_bfe_u32 s8, s19, 0xb0014
-; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
-; SI-NEXT:    s_andn2_b64 s[28:29], s[18:19], s[8:9]
-; SI-NEXT:    s_and_b32 s8, s19, 0x80000000
-; SI-NEXT:    v_mov_b32_e32 v9, s27
-; SI-NEXT:    s_cmp_lt_i32 s10, 0
-; SI-NEXT:    v_cndmask_b32_e64 v17, v9, v10, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v9, s29
-; SI-NEXT:    v_mov_b32_e32 v10, s8
-; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s10, 51
-; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v10, s19
-; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v10, v9, v10, s[10:11]
-; SI-NEXT:    v_mov_b32_e32 v9, s28
-; SI-NEXT:    v_cndmask_b32_e64 v9, v9, 0, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v11, s18
-; SI-NEXT:    s_bfe_u32 s8, s17, 0xb0014
-; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[10:11]
-; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
-; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s10
-; SI-NEXT:    s_andn2_b64 s[20:21], s[16:17], s[8:9]
-; SI-NEXT:    s_and_b32 s8, s17, 0x80000000
-; SI-NEXT:    s_cmp_lt_i32 s10, 0
-; SI-NEXT:    v_mov_b32_e32 v11, s21
+; SI-NEXT:    s_add_i32 s7, s4, 0xfffffc01
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], s7
+; SI-NEXT:    s_andn2_b64 s[4:5], s[12:13], s[4:5]
+; SI-NEXT:    s_and_b32 s8, s13, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b32 s4, 0, s4
+; SI-NEXT:    s_cselect_b32 s5, s8, s5
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b32 s5, s13, s5
+; SI-NEXT:    s_cselect_b32 s4, s12, s4
+; SI-NEXT:    s_bfe_u32 s7, s19, 0xb0014
+; SI-NEXT:    s_addk_i32 s7, 0xfc01
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], s7
+; SI-NEXT:    s_andn2_b64 s[8:9], s[18:19], s[8:9]
+; SI-NEXT:    s_and_b32 s10, s19, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    s_cselect_b32 s8, 0, s8
+; SI-NEXT:    s_cselect_b32 s9, s10, s9
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
+; SI-NEXT:    s_cselect_b32 s9, s19, s9
+; SI-NEXT:    s_cselect_b32 s8, s18, s8
+; SI-NEXT:    s_bfe_u32 s7, s17, 0xb0014
+; SI-NEXT:    v_mov_b32_e32 v13, s5
+; SI-NEXT:    s_addk_i32 s7, 0xfc01
+; SI-NEXT:    v_mov_b32_e32 v12, s4
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], s7
+; SI-NEXT:    v_add_f64 v[12:13], s[12:13], -v[12:13]
+; SI-NEXT:    s_andn2_b64 s[10:11], s[16:17], s[10:11]
+; SI-NEXT:    s_and_b32 s12, s17, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v13, s9
+; SI-NEXT:    s_cselect_b32 s10, 0, s10
+; SI-NEXT:    s_cselect_b32 s11, s12, s11
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
 ; SI-NEXT:    v_mov_b32_e32 v12, s8
-; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s10, 51
-; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v12, s17
-; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; SI-NEXT:    v_cndmask_b32_e64 v14, v11, v12, s[10:11]
-; SI-NEXT:    v_mov_b32_e32 v11, s20
-; SI-NEXT:    v_cndmask_b32_e64 v11, v11, 0, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v12, s16
-; SI-NEXT:    v_cndmask_b32_e64 v13, v11, v12, s[10:11]
-; SI-NEXT:    v_add_f64 v[11:12], s[16:17], -v[13:14]
-; SI-NEXT:    v_mov_b32_e32 v19, s17
-; SI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[11:12]|, 0.5
-; SI-NEXT:    v_mov_b32_e32 v11, s19
-; SI-NEXT:    v_bfi_b32 v20, s23, v8, v11
-; SI-NEXT:    v_add_f64 v[11:12], s[18:19], -v[9:10]
-; SI-NEXT:    v_bfi_b32 v19, s23, v8, v19
-; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[11:12]|, 0.5
-; SI-NEXT:    v_mov_b32_e32 v11, 0
-; SI-NEXT:    v_cndmask_b32_e64 v12, 0, v20, s[10:11]
-; SI-NEXT:    v_add_f64 v[11:12], v[9:10], v[11:12]
-; SI-NEXT:    v_cndmask_b32_e64 v10, 0, v19, s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v9, 0
-; SI-NEXT:    v_mov_b32_e32 v16, s15
-; SI-NEXT:    v_add_f64 v[9:10], v[13:14], v[9:10]
-; SI-NEXT:    v_mov_b32_e32 v13, s24
-; SI-NEXT:    v_cndmask_b32_e64 v14, v15, v16, s[2:3]
-; SI-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v15, s14
-; SI-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v15, s15
-; SI-NEXT:    v_bfi_b32 v19, s23, v8, v15
-; SI-NEXT:    v_mov_b32_e32 v15, s26
-; SI-NEXT:    v_mov_b32_e32 v18, s13
-; SI-NEXT:    v_cndmask_b32_e64 v15, v15, 0, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v16, s12
-; SI-NEXT:    v_cndmask_b32_e64 v18, v17, v18, s[6:7]
-; SI-NEXT:    v_cndmask_b32_e64 v17, v15, v16, s[6:7]
-; SI-NEXT:    v_mov_b32_e32 v15, s13
-; SI-NEXT:    v_bfi_b32 v8, s23, v8, v15
-; SI-NEXT:    v_add_f64 v[15:16], s[12:13], -v[17:18]
-; SI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5
-; SI-NEXT:    v_add_f64 v[15:16], s[14:15], -v[13:14]
-; SI-NEXT:    s_mov_b32 s23, 0xf000
-; SI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5
-; SI-NEXT:    v_mov_b32_e32 v15, 0
-; SI-NEXT:    v_cndmask_b32_e64 v16, 0, v19, s[0:1]
-; SI-NEXT:    v_add_f64 v[15:16], v[13:14], v[15:16]
-; SI-NEXT:    v_cndmask_b32_e32 v14, 0, v8, vcc
-; SI-NEXT:    v_mov_b32_e32 v13, 0
-; SI-NEXT:    v_add_f64 v[13:14], v[17:18], v[13:14]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48
-; SI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
+; SI-NEXT:    s_cselect_b32 s11, s17, s11
+; SI-NEXT:    v_mov_b32_e32 v9, s13
+; SI-NEXT:    v_add_f64 v[12:13], s[18:19], -v[12:13]
+; SI-NEXT:    s_cselect_b32 s10, s16, s10
+; SI-NEXT:    v_mov_b32_e32 v15, s11
+; SI-NEXT:    v_bfi_b32 v9, s6, v8, v9
+; SI-NEXT:    v_mov_b32_e32 v14, s10
+; SI-NEXT:    v_cndmask_b32_e32 v17, 0, v9, vcc
+; SI-NEXT:    v_mov_b32_e32 v9, s19
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[12:13]|, 0.5
+; SI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[14:15]
+; SI-NEXT:    v_bfi_b32 v9, s6, v8, v9
+; SI-NEXT:    v_cndmask_b32_e32 v13, 0, v9, vcc
+; SI-NEXT:    v_mov_b32_e32 v9, s17
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
+; SI-NEXT:    v_bfi_b32 v8, s6, v8, v9
+; SI-NEXT:    v_mov_b32_e32 v12, 0
+; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v8, vcc
+; SI-NEXT:    v_mov_b32_e32 v8, 0
+; SI-NEXT:    v_mov_b32_e32 v16, 0
+; SI-NEXT:    v_add_f64 v[14:15], s[8:9], v[12:13]
+; SI-NEXT:    v_add_f64 v[12:13], s[10:11], v[8:9]
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    v_add_f64 v[8:9], s[4:5], v[16:17]
+; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: round_v8f64:

diff  --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 407a4e5f1b764..86bf754ecfcf0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -9,8 +9,8 @@
 ; GCN: s_load_dwordx2
 
 ; GCN: s_cmp_eq_u32
-; GCN: v_cndmask_b32
-; GCN: v_cndmask_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 
 ; GCN-NOT: load_dword
 ; GCN: flat_load_dwordx2
@@ -35,8 +35,8 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
-; GCN: v_cndmask_b32
-; GCN: v_cndmask_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 ; GCN: flat_store_dwordx2
 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
   %tmp2 = icmp eq i32 %tmp, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index 8b9f89b3b031c..db8904ef71e82 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -74,7 +74,7 @@ entry:
 ; Check that the select instruction is not deleted.
 ; FUNC-LABEL: {{^}}i24_i32_i32_mad:
 ; EG: CNDE_INT
-; SI: v_cndmask
+; SI: s_cselect
 ; GCN2: s_cselect
 define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 16c5be25b7902..94c946321499e 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -259,7 +259,7 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
 ; GCN-DAG: s_and_b32
 ; GCN-DAG: s_sub_i32
 ; GCN-DAG: s_lshr_b32
-; GCN: v_add_i32_e32
+; GCN: s_add_i32
 define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %sub0 = sub i8 %a, %b
@@ -275,8 +275,8 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %
 ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1:
 ; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -294,7 +294,7 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)*
 ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2:
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %d

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 5df7470f65f2c..465acccd3d4d9 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v4, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
@@ -91,11 +91,11 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; TONGA-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v3, v1
 ; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v4, v0
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
-; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v1, v0
-; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; TONGA-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
@@ -118,34 +118,38 @@ define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v5
-; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v1
-; GFX9-NEXT:    v_add_u32_e32 v5, 1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v3
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT:    s_xor_b32 s3, s1, s3
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    s_sub_i32 s1, 0, s2
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s1, s1, s8
+; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT:    s_add_i32 s8, s8, s1
+; GFX9-NEXT:    s_mul_hi_u32 s1, s0, s8
+; GFX9-NEXT:    s_mul_i32 s8, s1, s2
+; GFX9-NEXT:    s_sub_i32 s0, s0, s8
+; GFX9-NEXT:    s_add_i32 s9, s1, 1
+; GFX9-NEXT:    s_sub_i32 s8, s0, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX9-NEXT:    s_cselect_b32 s1, s9, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s8, s0
+; GFX9-NEXT:    s_add_i32 s8, s1, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX9-NEXT:    s_cselect_b32 s0, s8, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s3
+; GFX9-NEXT:    s_sub_i32 s0, s0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -308,7 +312,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrsp
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_hi_i32 v1, v0, s2
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
@@ -331,7 +335,7 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrsp
 ; TONGA-NEXT:    s_mov_b32 s5, s1
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_mul_hi_i32 v1, v0, s2
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
 ; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
@@ -432,22 +436,22 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    v_mul_hi_u32 v4, v5, v10
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v7, v11
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v10, v5, v3
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
 ; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
 ; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
@@ -458,7 +462,7 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v8, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v9, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -505,8 +509,8 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v5, v10
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; TONGA-NEXT:    v_mul_hi_u32 v6, v7, v11
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v7, v6
+; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v6, v7
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
@@ -517,10 +521,10 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
-; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
 ; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
 ; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
+; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
 ; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
 ; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
@@ -538,75 +542,83 @@ define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ;
 ; GFX9-LABEL: sdiv_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_mov_b32 s10, s6
-; GFX9-NEXT:    s_mov_b32 s11, s7
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s8, s2
-; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
-; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, v3
-; GFX9-NEXT:    v_sub_u32_e32 v10, 0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v11, 0, v3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
-; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
-; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX9-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v8
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v9
-; GFX9-NEXT:    v_mul_lo_u32 v10, v10, v6
-; GFX9-NEXT:    v_mul_lo_u32 v11, v11, v7
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v10
-; GFX9-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GFX9-NEXT:    v_xor_b32_e32 v4, v8, v4
-; GFX9-NEXT:    v_xor_b32_e32 v5, v9, v5
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v11
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v6
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v2
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v3
-; GFX9-NEXT:    v_add_u32_e32 v10, 1, v6
-; GFX9-NEXT:    v_add_u32_e32 v11, 1, v7
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v8
-; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v9
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v8, v0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v9, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v8, 1, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[0:1]
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v9, 1, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
-; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v5
-; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
+; GFX9-NEXT:    s_add_i32 s0, s0, s1
+; GFX9-NEXT:    s_xor_b32 s6, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
+; GFX9-NEXT:    s_add_i32 s7, s7, s8
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    s_xor_b32 s9, s8, s1
+; GFX9-NEXT:    s_xor_b32 s1, s7, s8
+; GFX9-NEXT:    s_sub_i32 s7, 0, s6
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s7, s7, s8
+; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
+; GFX9-NEXT:    s_add_i32 s8, s8, s7
+; GFX9-NEXT:    s_mul_hi_u32 s7, s1, s8
+; GFX9-NEXT:    s_mul_i32 s8, s7, s6
+; GFX9-NEXT:    s_sub_i32 s1, s1, s8
+; GFX9-NEXT:    s_add_i32 s10, s7, 1
+; GFX9-NEXT:    s_sub_i32 s8, s1, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX9-NEXT:    s_cselect_b32 s7, s10, s7
+; GFX9-NEXT:    s_cselect_b32 s1, s8, s1
+; GFX9-NEXT:    s_add_i32 s8, s7, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s7
+; GFX9-NEXT:    s_ashr_i32 s7, s4, 31
+; GFX9-NEXT:    s_add_i32 s4, s4, s7
+; GFX9-NEXT:    s_xor_b32 s4, s4, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s6, s6, s9
+; GFX9-NEXT:    s_add_i32 s5, s5, s8
+; GFX9-NEXT:    s_xor_b32 s7, s8, s7
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s6, s6, s9
+; GFX9-NEXT:    s_xor_b32 s5, s5, s8
+; GFX9-NEXT:    s_sub_i32 s8, 0, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s8, s8, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s9, s8
+; GFX9-NEXT:    s_add_i32 s9, s9, s8
+; GFX9-NEXT:    s_mul_hi_u32 s8, s5, s9
+; GFX9-NEXT:    s_mul_i32 s9, s8, s4
+; GFX9-NEXT:    s_sub_i32 s5, s5, s9
+; GFX9-NEXT:    s_add_i32 s10, s8, 1
+; GFX9-NEXT:    s_sub_i32 s9, s5, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
+; GFX9-NEXT:    s_cselect_b32 s8, s10, s8
+; GFX9-NEXT:    s_cselect_b32 s5, s9, s5
+; GFX9-NEXT:    s_add_i32 s9, s8, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s8
+; GFX9-NEXT:    s_xor_b32 s4, s4, s7
+; GFX9-NEXT:    s_sub_i32 s4, s4, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_v2i32:
@@ -834,7 +846,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v5
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v11, v6
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
@@ -865,19 +877,19 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v4
 ; GCN-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v12, v12
 ; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v4
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
 ; GCN-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v10, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[2:3]
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v0
@@ -893,7 +905,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v7, v1, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v1, v8, v15
 ; GCN-NEXT:    v_xor_b32_e32 v5, v0, v16
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v15
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v15, v1
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v16, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v9, v12
 ; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
@@ -909,12 +921,12 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_mul_lo_u32 v6, v5, v4
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v17, v2
 ; GCN-NEXT:    v_xor_b32_e32 v7, v8, v14
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v4, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v8, vcc, v4, v3
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
@@ -965,7 +977,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v5
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v6
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v11, v11
 ; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
@@ -996,9 +1008,9 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v8
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
 ; TONGA-NEXT:    v_mul_hi_u32 v7, v1, v7
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v10
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v10, v0
 ; TONGA-NEXT:    v_mul_hi_u32 v0, v2, v0
 ; TONGA-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v12, v12
@@ -1008,7 +1020,7 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v5
 ; TONGA-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v10, v2
+; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v10
 ; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v7
 ; TONGA-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[2:3]
 ; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v0
@@ -1040,149 +1052,165 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; TONGA-NEXT:    v_mul_lo_u32 v6, v5, v4
 ; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v17, v2
 ; TONGA-NEXT:    v_xor_b32_e32 v7, v8, v14
-; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v4, v3
-; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; TONGA-NEXT:    v_subrev_u32_e32 v8, vcc, v4, v3
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
+; TONGA-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
 ; TONGA-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
-; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
+; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v7, v3
 ; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; TONGA-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_mov_b32 s11, 0xf000
-; GFX9-NEXT:    s_mov_b32 s10, -1
-; GFX9-NEXT:    s_mov_b32 s6, s10
-; GFX9-NEXT:    s_mov_b32 s7, s11
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, s2
-; GFX9-NEXT:    s_mov_b32 s5, s3
-; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX9-NEXT:    s_mov_b32 s8, s0
-; GFX9-NEXT:    s_mov_b32 s9, s1
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX9-NEXT:    s_add_i32 s1, s1, s4
+; GFX9-NEXT:    s_xor_b32 s6, s1, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v9
-; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v8
-; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v9
-; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; GFX9-NEXT:    v_xor_b32_e32 v16, v8, v9
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v11
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v4
-; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v10
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v13
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v11
-; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v12
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v15
-; GFX9-NEXT:    v_xor_b32_e32 v17, v10, v11
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v10
-; GFX9-NEXT:    v_xor_b32_e32 v6, v6, v13
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, v5
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v14
-; GFX9-NEXT:    v_xor_b32_e32 v18, v12, v13
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v12
-; GFX9-NEXT:    v_xor_b32_e32 v7, v7, v15
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v12, v6
-; GFX9-NEXT:    v_xor_b32_e32 v19, v14, v15
-; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v14
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, v7
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v12
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v14
-; GFX9-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GFX9-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
-; GFX9-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; GFX9-NEXT:    v_sub_u32_e32 v9, 0, v4
-; GFX9-NEXT:    v_mul_f32_e32 v14, 0x4f7ffffe, v14
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
-; GFX9-NEXT:    v_mul_lo_u32 v9, v9, v8
-; GFX9-NEXT:    v_sub_u32_e32 v11, 0, v5
-; GFX9-NEXT:    v_sub_u32_e32 v13, 0, v6
-; GFX9-NEXT:    v_mul_lo_u32 v11, v11, v10
-; GFX9-NEXT:    v_sub_u32_e32 v15, 0, v7
-; GFX9-NEXT:    v_mul_lo_u32 v13, v13, v12
-; GFX9-NEXT:    v_mul_lo_u32 v15, v15, v14
-; GFX9-NEXT:    v_mul_hi_u32 v9, v8, v9
-; GFX9-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GFX9-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GFX9-NEXT:    v_mul_hi_u32 v15, v14, v15
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v9
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX9-NEXT:    v_add_u32_e32 v9, v10, v11
-; GFX9-NEXT:    v_add_u32_e32 v10, v12, v13
-; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v9
-; GFX9-NEXT:    v_add_u32_e32 v11, v14, v15
-; GFX9-NEXT:    v_mul_hi_u32 v10, v2, v10
-; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v11
-; GFX9-NEXT:    v_mul_lo_u32 v12, v8, v4
-; GFX9-NEXT:    v_mul_lo_u32 v14, v9, v5
-; GFX9-NEXT:    v_mul_lo_u32 v15, v10, v6
-; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v12
-; GFX9-NEXT:    v_mul_lo_u32 v12, v11, v7
-; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v14
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_add_u32_e32 v14, 1, v9
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v13, v0, v4
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v5
-; GFX9-NEXT:    v_add_u32_e32 v15, 1, v10
-; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v14, s[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v14, v1, v5
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[2:3], v2, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
-; GFX9-NEXT:    v_add_u32_e32 v12, 1, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[2:3]
-; GFX9-NEXT:    v_sub_u32_e32 v15, v2, v6
-; GFX9-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v7
-; GFX9-NEXT:    v_add_u32_e32 v13, 1, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v14, s[0:1]
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
-; GFX9-NEXT:    v_sub_u32_e32 v12, v3, v7
-; GFX9-NEXT:    v_add_u32_e32 v14, 1, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v15, s[2:3]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; GFX9-NEXT:    v_add_u32_e32 v15, 1, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v14, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; GFX9-NEXT:    v_add_u32_e32 v12, 1, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v15, vcc
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v11, v12, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v16
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v17
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v18
-; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v19
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v16
-; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v17
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v18
-; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v19
-; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v4
+; GFX9-NEXT:    s_ashr_i32 s9, s8, 31
+; GFX9-NEXT:    s_add_i32 s8, s8, s9
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s4, s9, s4
+; GFX9-NEXT:    s_xor_b32 s8, s8, s9
+; GFX9-NEXT:    s_sub_i32 s9, 0, s6
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v0
+; GFX9-NEXT:    s_mul_i32 s9, s9, s10
+; GFX9-NEXT:    s_mul_hi_u32 s9, s10, s9
+; GFX9-NEXT:    s_add_i32 s10, s10, s9
+; GFX9-NEXT:    s_mul_hi_u32 s9, s8, s10
+; GFX9-NEXT:    s_mul_i32 s10, s9, s6
+; GFX9-NEXT:    s_sub_i32 s8, s8, s10
+; GFX9-NEXT:    s_add_i32 s11, s9, 1
+; GFX9-NEXT:    s_sub_i32 s10, s8, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s8, s6
+; GFX9-NEXT:    s_cselect_b32 s9, s11, s9
+; GFX9-NEXT:    s_cselect_b32 s8, s10, s8
+; GFX9-NEXT:    s_add_i32 s10, s9, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s8, s6
+; GFX9-NEXT:    s_cselect_b32 s6, s10, s9
+; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
+; GFX9-NEXT:    s_add_i32 s7, s7, s8
+; GFX9-NEXT:    s_xor_b32 s7, s7, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v5
+; GFX9-NEXT:    s_ashr_i32 s11, s10, 31
+; GFX9-NEXT:    s_xor_b32 s6, s6, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_add_i32 s10, s10, s11
+; GFX9-NEXT:    s_xor_b32 s8, s11, s8
+; GFX9-NEXT:    s_sub_i32 s4, s6, s4
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s6, s10, s11
+; GFX9-NEXT:    s_sub_i32 s10, 0, s7
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s11, v0
+; GFX9-NEXT:    s_mul_i32 s10, s10, s11
+; GFX9-NEXT:    s_mul_hi_u32 s10, s11, s10
+; GFX9-NEXT:    s_add_i32 s11, s11, s10
+; GFX9-NEXT:    s_mul_hi_u32 s10, s6, s11
+; GFX9-NEXT:    s_mul_i32 s11, s10, s7
+; GFX9-NEXT:    s_sub_i32 s6, s6, s11
+; GFX9-NEXT:    s_add_i32 s12, s10, 1
+; GFX9-NEXT:    s_sub_i32 s11, s6, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s7
+; GFX9-NEXT:    s_cselect_b32 s10, s12, s10
+; GFX9-NEXT:    s_cselect_b32 s6, s11, s6
+; GFX9-NEXT:    s_add_i32 s11, s10, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s7
+; GFX9-NEXT:    s_cselect_b32 s6, s11, s10
+; GFX9-NEXT:    s_ashr_i32 s7, s9, 31
+; GFX9-NEXT:    s_add_i32 s9, s9, s7
+; GFX9-NEXT:    s_xor_b32 s9, s9, s7
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT:    v_readfirstlane_b32 s11, v6
+; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX9-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_add_i32 s11, s11, s12
+; GFX9-NEXT:    s_xor_b32 s7, s12, s7
+; GFX9-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s8, s11, s12
+; GFX9-NEXT:    s_sub_i32 s11, 0, s9
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX9-NEXT:    s_mul_i32 s11, s11, s12
+; GFX9-NEXT:    s_mul_hi_u32 s11, s12, s11
+; GFX9-NEXT:    s_add_i32 s12, s12, s11
+; GFX9-NEXT:    s_mul_hi_u32 s11, s8, s12
+; GFX9-NEXT:    s_mul_i32 s12, s11, s9
+; GFX9-NEXT:    s_sub_i32 s8, s8, s12
+; GFX9-NEXT:    s_add_i32 s13, s11, 1
+; GFX9-NEXT:    s_sub_i32 s12, s8, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s8, s9
+; GFX9-NEXT:    s_cselect_b32 s11, s13, s11
+; GFX9-NEXT:    s_cselect_b32 s8, s12, s8
+; GFX9-NEXT:    s_add_i32 s12, s11, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s8, s9
+; GFX9-NEXT:    s_cselect_b32 s8, s12, s11
+; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
+; GFX9-NEXT:    s_add_i32 s5, s5, s9
+; GFX9-NEXT:    s_xor_b32 s5, s5, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    s_ashr_i32 s4, s10, 31
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    s_xor_b32 s6, s8, s7
+; GFX9-NEXT:    s_xor_b32 s8, s4, s9
+; GFX9-NEXT:    s_sub_i32 s6, s6, s7
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    s_sub_i32 s7, 0, s5
+; GFX9-NEXT:    s_add_i32 s10, s10, s4
+; GFX9-NEXT:    s_xor_b32 s4, s10, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v2
+; GFX9-NEXT:    s_mul_i32 s7, s7, s9
+; GFX9-NEXT:    s_mul_hi_u32 s7, s9, s7
+; GFX9-NEXT:    s_add_i32 s9, s9, s7
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s9
+; GFX9-NEXT:    s_mul_i32 s9, s7, s5
+; GFX9-NEXT:    s_sub_i32 s4, s4, s9
+; GFX9-NEXT:    s_add_i32 s10, s7, 1
+; GFX9-NEXT:    s_sub_i32 s9, s4, s5
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s5
+; GFX9-NEXT:    s_cselect_b32 s7, s10, s7
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX9-NEXT:    s_add_i32 s9, s7, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s5
+; GFX9-NEXT:    s_cselect_b32 s4, s9, s7
+; GFX9-NEXT:    s_xor_b32 s4, s4, s8
+; GFX9-NEXT:    s_sub_i32 s4, s4, s8
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; EG-LABEL: sdiv_v4i32:
@@ -1824,7 +1852,7 @@ define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -2001,11 +2029,11 @@ define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)*
 ; GCN-NEXT:    v_mul_hi_u32 v3, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v1, v3, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v1, v5
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v2, v1
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
@@ -2049,11 +2077,11 @@ define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)*
 ; TONGA-NEXT:    v_mul_hi_u32 v3, v5, v3
 ; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v1, v5
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v2
-; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v2, v1
-; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v5, v1
+; TONGA-NEXT:    v_subrev_u32_e32 v5, vcc, v2, v1
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; TONGA-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
@@ -2077,37 +2105,41 @@ define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)*
 ; GFX9-NEXT:    s_mov_b32 s4, s0
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfe_i32 v2, v1, 0, 25
-; GFX9-NEXT:    v_bfe_i32 v1, v1, 24, 1
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v4, 0, v2
-; GFX9-NEXT:    v_bfe_i32 v5, v0, 0, 25
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 24, 1
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v0
-; GFX9-NEXT:    v_xor_b32_e32 v5, v5, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, 1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v4, v5, v4
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v3, v4, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 25
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    s_bfe_i32 s3, s2, 0x190000
+; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x10018
+; GFX9-NEXT:    s_add_i32 s3, s3, s2
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x190000
+; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x10018
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v1
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_xor_b32 s2, s0, s2
+; GFX9-NEXT:    s_xor_b32 s0, s1, s0
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s1, 0, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s1, s1, s8
+; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT:    s_add_i32 s8, s8, s1
+; GFX9-NEXT:    s_mul_hi_u32 s1, s0, s8
+; GFX9-NEXT:    s_mul_i32 s8, s1, s3
+; GFX9-NEXT:    s_sub_i32 s0, s0, s8
+; GFX9-NEXT:    s_add_i32 s9, s1, 1
+; GFX9-NEXT:    s_sub_i32 s8, s0, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX9-NEXT:    s_cselect_b32 s1, s9, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s8, s0
+; GFX9-NEXT:    s_add_i32 s8, s1, 1
+; GFX9-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX9-NEXT:    s_cselect_b32 s0, s8, s1
+; GFX9-NEXT:    s_xor_b32 s0, s0, s2
+; GFX9-NEXT:    s_sub_i32 s0, s0, s2
+; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x190000
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2252,10 +2284,10 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
 ; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
+; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
+; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
 ; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 8174daa9144ab..c2454772ebfaf 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -59,9 +59,9 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -111,12 +111,13 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
@@ -126,10 +127,9 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GCN-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
@@ -142,6 +142,7 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
@@ -153,37 +154,39 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[8:9]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[12:13], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[12:13], 0
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s12
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s13
+; GCN-IR-NEXT:    s_min_u32 s18, s8, s9
+; GCN-IR-NEXT:    s_sub_u32 s16, s14, s18
+; GCN-IR-NEXT:    s_subb_u32 s17, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[16:17], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[16:17], 63
+; GCN-IR-NEXT:    s_or_b64 s[20:21], s[10:11], s[20:21]
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[20:21], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s13
+; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s12
+; GCN-IR-NEXT:    s_or_b64 s[20:21], s[20:21], s[22:23]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[14:15]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_min_u32 s14, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s12
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s13
-; GCN-IR-NEXT:    s_min_u32 s18, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s14, s18
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[10:11], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[10:11], 63
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[20:21]
-; GCN-IR-NEXT:    s_or_b64 s[20:21], s[16:17], s[22:23]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s17, s11, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[16:17], 0
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s10
+; GCN-IR-NEXT:    s_add_u32 s20, s16, 1
+; GCN-IR-NEXT:    s_addc_u32 s21, s17, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[20:21], 0
+; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s16
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s16
+; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s20
 ; GCN-IR-NEXT:    s_add_u32 s19, s6, -1
 ; GCN-IR-NEXT:    s_addc_u32 s20, s7, -1
 ; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
@@ -214,24 +217,16 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
 ; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    s_branch .LBB0_6
-; GCN-IR-NEXT:  .LBB0_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[16:17]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[16:17]
-; GCN-IR-NEXT:  .LBB0_6: ; %udiv-end
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[6:7]
+; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[10:11], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s0, s2, s0
+; GCN-IR-NEXT:    s_subb_u32 s1, s3, s1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 %x, %y
@@ -485,15 +480,15 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -515,15 +510,15 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -591,16 +586,16 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_xor_b32 s0, s3, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_or_b32 s2, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -618,16 +613,16 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_xor_b32 s0, s3, s8
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_or_b32 s2, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-IR-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -655,15 +650,15 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -685,15 +680,15 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -722,15 +717,15 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -752,15 +747,15 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -789,15 +784,15 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -819,15 +814,15 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -853,35 +848,35 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s4, s4, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    s_or_b32 s7, s4, 1
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s7, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s10
-; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s6
 ; GCN-NEXT:    s_xor_b32 s4, s6, s10
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s4, s4, 1
-; GCN-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GCN-NEXT:    s_or_b32 s6, s4, 1
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 24
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -899,35 +894,35 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-IR-NEXT:    s_or_b32 s7, s4, 1
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, s7, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v2, s10
-; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, s6
 ; GCN-IR-NEXT:    s_xor_b32 s4, s6, s10
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, s4
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
+; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-IR-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; GCN-IR-NEXT:    v_bfe_i32 v2, v2, 0, 24
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -966,7 +961,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -977,6 +972,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-IR-NEXT:    s_sext_i32_i16 s1, s1
@@ -996,26 +992,28 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[10:11]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[14:15]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_min_u32 s14, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s12
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s13
-; GCN-IR-NEXT:    s_min_u32 s18, s10, s11
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[12:13], 0
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s12
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s13
+; GCN-IR-NEXT:    s_min_u32 s18, s8, s9
 ; GCN-IR-NEXT:    s_sub_u32 s10, s14, s18
 ; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
 ; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[10:11], 63
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[22:23], s[10:11], 63
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[20:21]
-; GCN-IR-NEXT:    s_or_b64 s[20:21], s[16:17], s[22:23]
+; GCN-IR-NEXT:    s_or_b64 s[20:21], s[16:17], s[20:21]
+; GCN-IR-NEXT:    s_and_b64 s[16:17], s[20:21], exec
+; GCN-IR-NEXT:    s_cselect_b32 s17, 0, s13
+; GCN-IR-NEXT:    s_cselect_b32 s16, 0, s12
+; GCN-IR-NEXT:    s_or_b64 s[20:21], s[20:21], s[22:23]
+; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s16, s10, 1
@@ -1057,25 +1055,18 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
 ; GCN-IR-NEXT:  .LBB9_4: ; %Flow3
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    s_branch .LBB9_6
-; GCN-IR-NEXT:  .LBB9_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[16:17]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[16:17]
-; GCN-IR-NEXT:  .LBB9_6: ; %udiv-end
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[6:7]
+; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[16:17], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s0, s2, s0
+; GCN-IR-NEXT:    s_subb_u32 s1, s3, s1
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-IR-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
@@ -1176,23 +1167,23 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v4, s[0:1], 2, v0
+; GCN-NEXT:    v_add_i32_e64 v4, s[0:1], 1, v0
 ; GCN-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, 0, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v6, s[0:1], 1, v0
+; GCN-NEXT:    v_add_i32_e64 v6, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, 0, s[0:1]
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, v4, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s8
@@ -1204,36 +1195,39 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-LABEL: s_test_sdiv_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s3, 31
 ; GCN-IR-NEXT:    s_mov_b32 s5, s4
 ; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-IR-NEXT:    s_sub_u32 s2, s2, s4
 ; GCN-IR-NEXT:    s_subb_u32 s3, s3, s4
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
-; GCN-IR-NEXT:    s_min_u32 s10, s8, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s10, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[8:9], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[12:13]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[14:15]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
+; GCN-IR-NEXT:    s_add_u32 s12, s10, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s13, 0, -1
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[12:13], 63
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[14:15]
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[14:15], exec
+; GCN-IR-NEXT:    s_cselect_b32 s8, 0, 24
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s8
+; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
+; GCN-IR-NEXT:    s_addc_u32 s15, s13, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[14:15], 0
+; GCN-IR-NEXT:    s_sub_i32 s11, 63, s12
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s11
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s12
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s14
 ; GCN-IR-NEXT:    s_add_u32 s16, s2, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s3, -1
 ; GCN-IR-NEXT:    s_sub_u32 s10, 58, s10
@@ -1263,21 +1257,15 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_3
 ; GCN-IR-NEXT:  .LBB10_4: ; %Flow5
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-IR-NEXT:    s_branch .LBB10_6
-; GCN-IR-NEXT:  .LBB10_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[12:13]
-; GCN-IR-NEXT:  .LBB10_6: ; %udiv-end
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s5, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[2:3]
+; GCN-IR-NEXT:  .LBB10_5: ; %udiv-end
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[4:5]
+; GCN-IR-NEXT:    s_sub_u32 s4, s6, s4
+; GCN-IR-NEXT:    s_subb_u32 s5, s7, s5
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 24, %x
@@ -1788,16 +1776,16 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-NEXT:    s_or_b32 s0, s0, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_or_b32 s2, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1815,16 +1803,16 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_or_b32 s2, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-IR-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1847,17 +1835,17 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s2, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mad_f32 v0, -v1, s8, v0
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s8
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, s8
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -1873,17 +1861,17 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s2, 30
-; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_mad_f32 v0, -v1, s8, v0
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s8
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-IR-NEXT:    s_or_b32 s2, s0, 1
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, s8
+; GCN-IR-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-IR-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
index 4e2f8afba64a1..772f07f3aecd0 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
@@ -9,23 +9,24 @@ define amdgpu_kernel void @select_constant_cttz(i32 addrspace(1)* noalias %out,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s0, 1, s2
-; GCN-NEXT:    s_ff1_i32_b32 s0, s0
+; GCN-NEXT:    s_lshr_b32 s4, 1, s2
 ; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[2:3]
-; GCN-NEXT:    v_ffbh_i32_e32 v1, v0
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 31, v1
-; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[0:1]
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_ff1_i32_b32 s2, s4
+; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s2, -1, s2
+; GCN-NEXT:    s_flbit_i32 s6, s2
+; GCN-NEXT:    s_sub_i32 s8, 31, s6
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s4, -1, s8
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %v    = load i32, i32 addrspace(1)* %arrayidx, align 4
   %sr   = lshr i32 1, %v

diff  --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 07ccf84c70feb..454a45d6365da 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -9,10 +9,11 @@
 ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
 ; GCN-DAG: s_cmp_lg_u32
 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
-; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]]
-; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]]
+; GCN: s_and_b64 [[AND2:s\[[0-9]+:[0-9]+\]]], [[AND1]], exec
+; GCN: s_cselect_b32 [[RESULT:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; GCN: buffer_store_dword [[VRESULT]]
 define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
@@ -25,10 +26,11 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i3
 ; GCN-LABEL: {{^}}opt_select_i32_and_cmp_f32:
 ; GCN-DAG: v_cmp_lg_f32_e32 vcc
 ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 vcc, vcc, [[CMP1]]
-; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_and_b64 [[CMP1]], vcc, [[CMP1]]
+; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[CMP1]], exec
+; GCN: s_cselect_b32 [[RESULT:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; GCN: buffer_store_dword [[VRESULT]]
 define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
@@ -43,10 +45,13 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, fl
 ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
 ; GCN-DAG: s_cmp_lg_u32
 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
-; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]]
-; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]]
+; GCN: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]]
+; GCN: s_and_b64 [[AND2:s\[[0-9]+:[0-9]+\]]], [[AND1]], exec
+; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]]
+; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
+; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
 define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
@@ -59,10 +64,13 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i3
 ; GCN-LABEL: {{^}}opt_select_i64_and_cmp_f32:
 ; GCN-DAG: v_cmp_lg_f32_e32 vcc,
 ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 vcc, vcc, [[CMP1]]
-; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]]
+; GCN: s_and_b64 [[AND1:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP1]]
+; GCN: s_and_b64 [[AND2:s\[[0-9]+:[0-9]+\]]], [[AND1]], exec
+; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]]
+; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
+; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
 define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
@@ -77,10 +85,11 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, fl
 ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
 ; GCN-DAG: s_cmp_lg_u32
 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
-; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]]
-; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]]
+; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec
+; GCN-DAG: s_cselect_b32 [[RESULT:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; GCN: buffer_store_dword [[VRESULT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
@@ -94,10 +103,11 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32
 ; GCN-LABEL: {{^}}opt_select_i32_or_cmp_f32:
 ; GCN-DAG: v_cmp_lg_f32_e32 vcc
 ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 vcc, vcc, [[CMP1]]
-; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
+; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP1]]
+; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec
+; GCN-DAG: s_cselect_b32 [[RESULT:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; GCN: buffer_store_dword [[VRESULT]]
 define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
@@ -112,10 +122,13 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, flo
 ; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
 ; GCN-DAG: s_cmp_lg_u32
 ; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
-; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]]
-; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]]
+; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP2]]
+; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec
+; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]]
+; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
+; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
 define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
@@ -128,10 +141,13 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32
 ; GCN-LABEL: {{^}}opt_select_i64_or_cmp_f32:
 ; GCN-DAG: v_cmp_lg_f32_e32 vcc,
 ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 vcc, vcc, [[CMP1]]
-; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
-; GCN: buffer_store_dwordx2 v[[[RESULT0]]:[[RESULT1]]]
+; GCN: s_or_b64 [[OR:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP1]]
+; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[OR]], exec
+; GCN-DAG: s_cselect_b32 [[RESULT0:s[0-9]+]]
+; GCN-DAG: s_cselect_b32 [[RESULT1:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT1:[0-9]+]], [[RESULT0]]
+; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]]
+; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]]
 define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c

diff  --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index aaaa68f571325..efaf2b2e1c469 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -69,7 +69,7 @@ define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8
 ; GFX89: s_cselect_b32
 ; GFX89-NOT: s_cselect_b32
 
-; SI: v_cndmask_b32
+; SI: s_cselect_b32
 ; SI-NOT: cndmask
 define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
   %cmp = icmp eq i8 %c, 0
@@ -83,7 +83,7 @@ define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a,
 ; GFX89: s_cselect_b32
 ; GFX89-NOT: s_cselect_b32
 
-; SI: v_cndmask_b32_e32
+; SI: s_cselect_b32
 ; SI-NOT: v_cndmask_b32e
 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
@@ -111,10 +111,10 @@ define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; SI: cndmask
 ; SI-NOT: cndmask
 
-; GFX89: v_cndmask_b32_e32
-; GFX89: cndmask
-; VI: cndmask
-; GFX89-NOT: cndmask
+; VI: s_cselect_b32
+; VI: s_cselect_b32
+; GFX9: cndmask
+; GFX9: cndmask
 define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
   %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
   %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
@@ -156,8 +156,8 @@ define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16
 ; vector select with SGPR inputs.
 
 ; GCN-LABEL: {{^}}s_select_v2i32:
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 ; GCN: buffer_store_dwordx2
 define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
@@ -167,10 +167,10 @@ define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
 }
 
 ; GCN-LABEL: {{^}}s_select_v4i32:
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
@@ -198,14 +198,14 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}select_v8i32:
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
@@ -214,15 +214,9 @@ define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32>
 }
 
 ; GCN-LABEL: {{^}}s_select_v2f32:
-; GCN-DAG: s_load_dwordx4 s[[[ALO:[0-9]+]]:[[BHI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
-
-; GCN-DAG: v_cndmask_b32_e32
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: s_cselect_b32
+; GCN-DAG: s_cselect_b32
 ; GCN: buffer_store_dwordx2
 define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
@@ -234,9 +228,9 @@ define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x f
 ; GCN-LABEL: {{^}}s_select_v3f32:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 
 ; GCN: buffer_store_dwordx
 define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
@@ -250,10 +244,10 @@ define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x f
 ; GCN: s_load_dwordx8
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
@@ -284,11 +278,11 @@ bb:
 ; GCN-LABEL: {{^}}s_select_v5f32:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 
 ; GCN: buffer_store_dwordx
 define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
@@ -315,10 +309,10 @@ define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x flo
 }
 
 ; GCN-LABEL: {{^}}select_v2f64:
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
@@ -327,14 +321,14 @@ define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x do
 }
 
 ; GCN-LABEL: {{^}}select_v4f64:
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
@@ -343,22 +337,22 @@ define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x do
 }
 
 ; GCN-LABEL: {{^}}select_v8f64:
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b

diff  --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index 61ce9c12526d4..0224708fb586c 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=SI,GCN %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI,GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}select0:
 ; i64 select should be split into two i32 selects, and we shouldn't need
 ; to use a shfit to extract the hi dword of the input.
 ; GCN-NOT: s_lshr_b64
-; GCN: v_cndmask
-; GCN: v_cndmask
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
 define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
 entry:
   %0 = icmp ugt i32 %cond, 5
@@ -16,10 +16,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}select_trunc_i64:
-; VI: s_cselect_b32
-; VI-NOT: s_cselect_b32
-; SI: v_cndmask_b32
-; SI-NOT: v_cndmask_b32
+; GCN: s_cselect_b32
+; GCN-NOT: s_cselect_b32
 define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 0, i64 %in
@@ -29,10 +27,8 @@ define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i
 }
 
 ; GCN-LABEL: {{^}}select_trunc_i64_2:
-; VI: s_cselect_b32
-; VI-NOT: s_cselect_b32
-; SI: v_cndmask_b32
-; SI-NOT: v_cndmask_b32
+; GCN: s_cselect_b32
+; GCN-NOT: s_cselect_b32
 define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 %a, i64 %b
@@ -42,10 +38,8 @@ define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond,
 }
 
 ; GCN-LABEL: {{^}}v_select_trunc_i64_2:
-; VI: s_cselect_b32
-; VI-NOT: s_cselect_b32
-; SI: v_cndmask_b32
-; SI-NOT: v_cndmask_b32
+; GCN: s_cselect_b32
+; GCN-NOT: s_cselect_b32
 define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
@@ -57,8 +51,8 @@ define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %con
 }
 
 ; GCN-LABEL: {{^}}v_select_i64_split_imm:
-; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
-; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
+; GCN-DAG: s_cselect_b32
+; GCN-DAG: s_cselect_b32
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5

diff  --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll
index 54a26a4cf676a..b743ba3ba8879 100644
--- a/llvm/test/CodeGen/AMDGPU/selectcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll
@@ -10,8 +10,7 @@
 ; EG: CNDE_INT
 ; SI: v_cmp_eq_u64
 ; VI: s_cmp_eq_u64
-; GCN: v_cndmask
-; GCN: v_cndmask
+; GCN: s_cselect_b32
 define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs

diff  --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
index 718cf7015a4a3..5b7099ff7ed2d 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -261,8 +261,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}i128_sle:
-; GCN: v_cmp_le_u64
 ; GCN: v_cmp_le_i64
+; CGV: v_cndmask
 ; SI: v_cmp_eq_u64
 ; VI: s_cmp_eq_u64
 define amdgpu_kernel void @i128_sle(i32 addrspace(1)* %out, i128 %a, i128 %b) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index ed0fc095ba5d4..403e553f8bec9 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -183,35 +183,29 @@ define i128 @v_ashr_i128_kv(i128 %rhs) {
 define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_shl_i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sub_i32 s9, 64, s8
-; GCN-NEXT:    s_sub_i32 s2, s8, 64
-; GCN-NEXT:    s_lshl_b64 s[0:1], s[6:7], s8
-; GCN-NEXT:    s_lshr_b64 s[10:11], s[4:5], s9
-; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], s2
-; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
-; GCN-NEXT:    s_cmp_lt_u32 s8, 64
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
-; GCN-NEXT:    s_lshl_b64 s[0:1], s[4:5], s8
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_sub_i32 s5, s4, 64
+; GCN-NEXT:    s_sub_i32 s12, 64, s4
+; GCN-NEXT:    s_lshl_b64 s[6:7], s[2:3], s4
+; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s4
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[0:1], s5
+; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], s12
+; GCN-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
+; GCN-NEXT:    s_cmp_lt_u32 s4, 64
+; GCN-NEXT:    s_cselect_b32 s0, s0, s10
+; GCN-NEXT:    s_cselect_b32 s1, s1, s11
+; GCN-NEXT:    s_cselect_b32 s5, s9, 0
+; GCN-NEXT:    s_cselect_b32 s6, s8, 0
+; GCN-NEXT:    s_cmp_eq_u32 s4, 0
+; GCN-NEXT:    s_cselect_b32 s1, s3, s1
+; GCN-NEXT:    s_cselect_b32 s0, s2, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = shl i128 %lhs, %rhs
@@ -222,35 +216,29 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_lshr_i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sub_i32 s9, 64, s8
-; GCN-NEXT:    s_sub_i32 s2, s8, 64
-; GCN-NEXT:    s_lshr_b64 s[0:1], s[4:5], s8
-; GCN-NEXT:    s_lshl_b64 s[10:11], s[6:7], s9
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[6:7], s2
-; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
-; GCN-NEXT:    s_cmp_lt_u32 s8, 64
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    s_lshr_b64 s[0:1], s[6:7], s8
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT:    s_sub_i32 s5, s4, 64
+; GCN-NEXT:    s_sub_i32 s12, 64, s4
+; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
+; GCN-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
+; GCN-NEXT:    s_lshr_b64 s[10:11], s[2:3], s5
+; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s12
+; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GCN-NEXT:    s_cmp_lt_u32 s4, 64
+; GCN-NEXT:    s_cselect_b32 s2, s2, s10
+; GCN-NEXT:    s_cselect_b32 s3, s3, s11
+; GCN-NEXT:    s_cselect_b32 s5, s9, 0
+; GCN-NEXT:    s_cselect_b32 s6, s8, 0
+; GCN-NEXT:    s_cmp_eq_u32 s4, 0
+; GCN-NEXT:    s_cselect_b32 s1, s1, s3
+; GCN-NEXT:    s_cselect_b32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = lshr i128 %lhs, %rhs
@@ -261,37 +249,30 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_ashr_i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s2, s7, 31
-; GCN-NEXT:    s_ashr_i64 s[0:1], s[6:7], s8
-; GCN-NEXT:    s_cmp_lt_u32 s8, 64
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
-; GCN-NEXT:    s_sub_i32 s0, s8, 64
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], s0
-; GCN-NEXT:    s_sub_i32 s0, 64, s8
-; GCN-NEXT:    s_lshl_b64 s[0:1], s[6:7], s0
-; GCN-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
-; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_cmp_eq_u32 s8, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v6, s4
+; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_sub_i32 s5, 64, s4
+; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
+; GCN-NEXT:    s_sub_i32 s10, s4, 64
+; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], s5
+; GCN-NEXT:    s_ashr_i32 s12, s3, 31
+; GCN-NEXT:    s_ashr_i64 s[10:11], s[2:3], s10
+; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], s4
+; GCN-NEXT:    s_cmp_lt_u32 s4, 64
+; GCN-NEXT:    s_cselect_b32 s3, s3, s12
+; GCN-NEXT:    s_cselect_b32 s2, s2, s12
+; GCN-NEXT:    s_cselect_b32 s5, s6, s10
+; GCN-NEXT:    s_cselect_b32 s6, s7, s11
+; GCN-NEXT:    s_cmp_eq_u32 s4, 0
+; GCN-NEXT:    s_cselect_b32 s1, s1, s6
+; GCN-NEXT:    s_cselect_b32 s0, s0, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = ashr i128 %lhs, %rhs
@@ -451,66 +432,69 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_shl_v2i128ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v10, 16
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v6, 16
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
-; GCN-NEXT:    s_sub_i32 s22, 64, s12
-; GCN-NEXT:    s_sub_i32 s20, s12, 64
-; GCN-NEXT:    s_lshr_b64 s[22:23], s[4:5], s22
-; GCN-NEXT:    s_lshl_b64 s[24:25], s[6:7], s12
-; GCN-NEXT:    s_lshl_b64 s[20:21], s[4:5], s20
-; GCN-NEXT:    s_or_b64 s[22:23], s[24:25], s[22:23]
-; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[12:13], s[14:15]
-; GCN-NEXT:    v_mov_b32_e32 v0, s21
-; GCN-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NEXT:    v_mov_b32_e32 v1, s22
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
-; GCN-NEXT:    s_sub_i32 s13, 64, s16
-; GCN-NEXT:    s_sub_i32 s6, s16, 64
-; GCN-NEXT:    s_lshr_b64 s[14:15], s[8:9], s13
-; GCN-NEXT:    s_lshl_b64 s[20:21], s[10:11], s16
-; GCN-NEXT:    s_lshl_b64 s[6:7], s[8:9], s6
-; GCN-NEXT:    s_or_b64 s[14:15], s[20:21], s[14:15]
-; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
+; GCN-NEXT:    s_sub_i32 s22, 64, s8
+; GCN-NEXT:    s_sub_i32 s20, s8, 64
+; GCN-NEXT:    s_lshr_b64 s[22:23], s[0:1], s22
+; GCN-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GCN-NEXT:    s_lshl_b64 s[18:19], s[2:3], s8
+; GCN-NEXT:    s_lshl_b64 s[20:21], s[0:1], s20
+; GCN-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s19, s19, s21
+; GCN-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
+; GCN-NEXT:    s_and_b64 s[22:23], s[10:11], exec
+; GCN-NEXT:    s_cselect_b32 s9, s3, s19
+; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s3, s18, s20
+; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
+; GCN-NEXT:    s_cselect_b32 s22, s2, s3
+; GCN-NEXT:    s_and_b64 s[2:3], s[18:19], s[10:11]
+; GCN-NEXT:    s_sub_i32 s18, 64, s12
+; GCN-NEXT:    s_sub_i32 s10, s12, 64
+; GCN-NEXT:    s_lshr_b64 s[18:19], s[4:5], s18
+; GCN-NEXT:    s_lshl_b64 s[20:21], s[6:7], s12
+; GCN-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
+; GCN-NEXT:    s_or_b64 s[18:19], s[20:21], s[18:19]
+; GCN-NEXT:    s_and_b64 s[20:21], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s11, s19, s11
+; GCN-NEXT:    s_or_b64 s[14:15], s[12:13], s[14:15]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[14:15], 0
+; GCN-NEXT:    s_and_b64 s[20:21], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s13, s7, s11
+; GCN-NEXT:    s_and_b64 s[20:21], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s7, s18, s10
+; GCN-NEXT:    s_and_b64 s[10:11], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s10, s6, s7
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
+; GCN-NEXT:    s_and_b64 s[6:7], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s6, s1, 0
+; GCN-NEXT:    s_cselect_b32 s7, s0, 0
+; GCN-NEXT:    s_lshl_b64 s[0:1], s[4:5], s12
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s1, s1, 0
+; GCN-NEXT:    s_cselect_b32 s0, s0, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v0, v1, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v0, v1, s[2:3]
-; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], s12
-; GCN-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_lshl_b64 s[2:3], s[8:9], s16
-; GCN-NEXT:    v_mov_b32_e32 v4, s3
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_mov_b32_e32 v2, s22
+; GCN-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = shl <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -520,66 +504,69 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_lshr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v10, 16
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v6, 16
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
-; GCN-NEXT:    s_sub_i32 s22, 64, s12
-; GCN-NEXT:    s_sub_i32 s20, s12, 64
-; GCN-NEXT:    s_lshl_b64 s[22:23], s[6:7], s22
-; GCN-NEXT:    s_lshr_b64 s[24:25], s[4:5], s12
-; GCN-NEXT:    s_lshr_b64 s[20:21], s[6:7], s20
-; GCN-NEXT:    s_or_b64 s[22:23], s[24:25], s[22:23]
-; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[12:13], s[14:15]
-; GCN-NEXT:    v_mov_b32_e32 v0, s21
-; GCN-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
-; GCN-NEXT:    s_sub_i32 s13, 64, s16
-; GCN-NEXT:    s_sub_i32 s4, s16, 64
-; GCN-NEXT:    s_lshl_b64 s[14:15], s[10:11], s13
-; GCN-NEXT:    s_lshr_b64 s[20:21], s[8:9], s16
-; GCN-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
-; GCN-NEXT:    s_or_b64 s[14:15], s[20:21], s[14:15]
-; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s14
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s8
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v2, v3, s[2:3]
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
+; GCN-NEXT:    s_sub_i32 s22, 64, s8
+; GCN-NEXT:    s_sub_i32 s20, s8, 64
+; GCN-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
+; GCN-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GCN-NEXT:    s_lshr_b64 s[18:19], s[0:1], s8
+; GCN-NEXT:    s_lshr_b64 s[20:21], s[2:3], s20
+; GCN-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s19, s19, s21
+; GCN-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
+; GCN-NEXT:    s_and_b64 s[22:23], s[10:11], exec
+; GCN-NEXT:    s_cselect_b32 s9, s1, s19
+; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s1, s18, s20
+; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
+; GCN-NEXT:    s_cselect_b32 s22, s0, s1
+; GCN-NEXT:    s_and_b64 s[0:1], s[18:19], s[10:11]
+; GCN-NEXT:    s_sub_i32 s18, 64, s12
+; GCN-NEXT:    s_sub_i32 s10, s12, 64
+; GCN-NEXT:    s_lshl_b64 s[18:19], s[6:7], s18
+; GCN-NEXT:    s_lshr_b64 s[20:21], s[4:5], s12
+; GCN-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
+; GCN-NEXT:    s_or_b64 s[18:19], s[20:21], s[18:19]
+; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s11, s19, s11
+; GCN-NEXT:    s_or_b64 s[14:15], s[12:13], s[14:15]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[14:15], 0
+; GCN-NEXT:    s_and_b64 s[20:21], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s13, s5, s11
+; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s5, s18, s10
+; GCN-NEXT:    s_and_b64 s[10:11], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s10, s4, s5
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s8
+; GCN-NEXT:    s_and_b64 s[4:5], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s4, s3, 0
+; GCN-NEXT:    s_cselect_b32 s5, s2, 0
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[6:7], s12
-; GCN-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[10:11], s16
-; GCN-NEXT:    v_mov_b32_e32 v6, s3
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v6, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v6, s2
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s3, 0
+; GCN-NEXT:    s_cselect_b32 s1, s2, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = lshr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -589,70 +576,71 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_ashr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx16 s[4:19], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v10, 16
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v6, 16
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
-; GCN-NEXT:    s_sub_i32 s22, 64, s12
-; GCN-NEXT:    s_sub_i32 s20, s12, 64
-; GCN-NEXT:    s_lshl_b64 s[22:23], s[6:7], s22
-; GCN-NEXT:    s_lshr_b64 s[24:25], s[4:5], s12
-; GCN-NEXT:    s_ashr_i64 s[20:21], s[6:7], s20
-; GCN-NEXT:    s_or_b64 s[22:23], s[24:25], s[22:23]
-; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[0:1], s[12:13], s[14:15]
-; GCN-NEXT:    v_mov_b32_e32 v0, s21
-; GCN-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
-; GCN-NEXT:    s_sub_i32 s13, 64, s16
-; GCN-NEXT:    s_sub_i32 s4, s16, 64
-; GCN-NEXT:    s_lshl_b64 s[14:15], s[10:11], s13
-; GCN-NEXT:    s_lshr_b64 s[20:21], s[8:9], s16
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[10:11], s4
-; GCN-NEXT:    s_or_b64 s[14:15], s[20:21], s[14:15]
-; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, v3, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s14
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v3, s8
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v2, v3, s[2:3]
-; GCN-NEXT:    s_ashr_i32 s4, s7, 31
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
+; GCN-NEXT:    s_sub_i32 s22, 64, s8
+; GCN-NEXT:    s_sub_i32 s20, s8, 64
+; GCN-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
+; GCN-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
+; GCN-NEXT:    s_lshr_b64 s[18:19], s[0:1], s8
+; GCN-NEXT:    s_ashr_i64 s[20:21], s[2:3], s20
+; GCN-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s19, s19, s21
+; GCN-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
+; GCN-NEXT:    s_and_b64 s[22:23], s[10:11], exec
+; GCN-NEXT:    s_cselect_b32 s9, s1, s19
+; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s1, s18, s20
+; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
+; GCN-NEXT:    s_cselect_b32 s22, s0, s1
+; GCN-NEXT:    s_and_b64 s[0:1], s[18:19], s[10:11]
+; GCN-NEXT:    s_sub_i32 s18, 64, s12
+; GCN-NEXT:    s_sub_i32 s10, s12, 64
+; GCN-NEXT:    s_lshl_b64 s[18:19], s[6:7], s18
+; GCN-NEXT:    s_lshr_b64 s[20:21], s[4:5], s12
+; GCN-NEXT:    s_ashr_i64 s[10:11], s[6:7], s10
+; GCN-NEXT:    s_or_b64 s[18:19], s[20:21], s[18:19]
+; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s11, s19, s11
+; GCN-NEXT:    s_or_b64 s[14:15], s[12:13], s[14:15]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[14:15], 0
+; GCN-NEXT:    s_and_b64 s[20:21], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s13, s5, s11
+; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s5, s18, s10
+; GCN-NEXT:    s_and_b64 s[10:11], s[14:15], exec
+; GCN-NEXT:    s_cselect_b32 s10, s4, s5
+; GCN-NEXT:    s_ashr_i32 s11, s3, 31
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], s8
+; GCN-NEXT:    s_and_b64 s[4:5], s[16:17], exec
+; GCN-NEXT:    s_cselect_b32 s4, s3, s11
+; GCN-NEXT:    s_cselect_b32 s5, s2, s11
+; GCN-NEXT:    s_ashr_i32 s8, s7, 31
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], s12
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    v_mov_b32_e32 v6, s2
-; GCN-NEXT:    s_ashr_i32 s4, s11, 31
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[10:11], s16
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GCN-NEXT:    v_mov_b32_e32 v6, s4
-; GCN-NEXT:    v_mov_b32_e32 v7, s3
-; GCN-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v6, v7, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[0:1]
-; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s3, s8
+; GCN-NEXT:    s_cselect_b32 s1, s2, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = ashr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null

diff  --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index c5ceb23337577..af290aa914adb 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -15,19 +15,12 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32
 ; uses an SGPR (implicit vcc).
 
 ; GCN-LABEL: {{^}}sint_to_fp_i1_f64:
-; VI-DAG: s_cmp_eq_u32
-; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0
-; VI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]]
-; VI: s_endpgm
-
-; CI-DAG: s_cmp_eq_u32
-; CI-DAG: s_cselect_b64 vcc, -1, 0
-; CI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}, vcc
-; CI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; CI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]]
-; CI: s_endpgm
+; GCN-DAG: s_cmp_eq_u32
+; GCN-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]]
+; GCN: s_endpgm
 define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = sitofp i1 %cmp to double

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index bdedff04198ea..f3ee132041722 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,7 +51,7 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -104,19 +104,19 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v4, s11
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -124,38 +124,41 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s4
-; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s5
-; GCN-IR-NEXT:    s_min_u32 s10, s14, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
-; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[8:9], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[12:13], s[18:19]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s4
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s5
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
+; GCN-IR-NEXT:    s_min_u32 s14, s6, s7
+; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[16:17]
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
+; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
+; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s12
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s16
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
 ; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
@@ -186,30 +189,24 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
 ; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    s_branch .LBB0_6
-; GCN-IR-NEXT:  .LBB0_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
-; GCN-IR-NEXT:  .LBB0_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s4, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s4, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
-; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-IR-NEXT:    s_mov_b32 s12, s0
+; GCN-IR-NEXT:    s_mul_i32 s0, s4, s9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT:    s_mul_i32 s0, s5, s8
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
+; GCN-IR-NEXT:    s_mul_i32 s0, s4, s8
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    s_mov_b32 s10, -1
-; GCN-IR-NEXT:    s_mov_b32 s8, s0
-; GCN-IR-NEXT:    s_mov_b32 s9, s1
+; GCN-IR-NEXT:    s_mov_b32 s15, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s14, -1
+; GCN-IR-NEXT:    s_mov_b32 s13, s1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
@@ -451,66 +448,72 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem23_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 41
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 41
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 41
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 41
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-NEXT:    s_or_b32 s3, s3, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-NEXT:    s_add_i32 s5, s6, s5
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-NEXT:    s_bfe_i32 s4, s4, 0x170000
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem23_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 41
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 41
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 41
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 41
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-IR-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-IR-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-IR-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-IR-NEXT:    s_add_i32 s5, s6, s5
+; GCN-IR-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-IR-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-IR-NEXT:    s_bfe_i32 s4, s4, 0x170000
+; GCN-IR-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 41
   %2 = ashr i64 %y, 41
@@ -522,66 +525,72 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem24_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 40
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 40
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-NEXT:    s_or_b32 s3, s3, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-NEXT:    s_add_i32 s5, s6, s5
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-NEXT:    s_bfe_i32 s4, s4, 0x180000
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem24_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 40
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 40
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-IR-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-IR-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-IR-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-IR-NEXT:    s_add_i32 s5, s6, s5
+; GCN-IR-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-IR-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-IR-NEXT:    s_bfe_i32 s4, s4, 0x180000
+; GCN-IR-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
@@ -647,66 +656,72 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) {
 define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem25_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 39
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 39
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 39
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-NEXT:    s_or_b32 s3, s3, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-NEXT:    s_add_i32 s5, s6, s5
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-NEXT:    s_bfe_i32 s4, s4, 0x190000
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem25_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 39
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 39
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 39
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 39
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-IR-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-IR-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-IR-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-IR-NEXT:    s_add_i32 s5, s6, s5
+; GCN-IR-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-IR-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-IR-NEXT:    s_bfe_i32 s4, s4, 0x190000
+; GCN-IR-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 39
   %2 = ashr i64 %y, 39
@@ -718,66 +733,72 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem31_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[4:5], 33
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 33
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 33
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-NEXT:    s_or_b32 s3, s3, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-NEXT:    s_add_i32 s5, s6, s5
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-NEXT:    s_bfe_i32 s4, s4, 0x1f0000
+; GCN-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem31_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[4:5], 33
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 33
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 33
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 33
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
-; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-IR-NEXT:    s_xor_b32 s5, s4, s8
+; GCN-IR-NEXT:    s_ashr_i32 s5, s5, 30
+; GCN-IR-NEXT:    s_or_b32 s5, s5, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 31
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-IR-NEXT:    s_cselect_b32 s5, s5, 0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s6, v2
+; GCN-IR-NEXT:    s_add_i32 s5, s6, s5
+; GCN-IR-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-IR-NEXT:    s_sub_i32 s4, s4, s5
+; GCN-IR-NEXT:    s_bfe_i32 s4, s4, 0x1f0000
+; GCN-IR-NEXT:    s_ashr_i32 s5, s4, 31
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
@@ -790,28 +811,28 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem32_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s4, s[0:1], 0xe
+; GCN-NEXT:    s_load_dword s8, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
-; GCN-NEXT:    s_xor_b32 s2, s3, s4
+; GCN-NEXT:    s_xor_b32 s2, s3, s8
 ; GCN-NEXT:    s_ashr_i32 s2, s2, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_or_b32 s2, s2, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -819,28 +840,28 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64
 ;
 ; GCN-IR-LABEL: s_test_srem32_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s8, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s3
-; GCN-IR-NEXT:    s_xor_b32 s2, s3, s4
+; GCN-IR-NEXT:    s_xor_b32 s2, s3, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s2, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_or_b32 s2, s2, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s2, s2, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s3, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -911,7 +932,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -967,19 +988,19 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v5, s15
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v4, s15
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s6, v1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
@@ -992,6 +1013,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-IR-NEXT:    s_mov_b32 s13, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[0:1], 31
@@ -1005,37 +1027,39 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
 ; GCN-IR-NEXT:    s_sub_u32 s8, s6, s10
 ; GCN-IR-NEXT:    s_subb_u32 s9, s7, s10
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s8
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s9
+; GCN-IR-NEXT:    s_min_u32 s12, s6, s7
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
+; GCN-IR-NEXT:    s_min_u32 s16, s6, s7
+; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
+; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[14:15], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT:    s_or_b64 s[18:19], s[10:11], s[18:19]
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[18:19], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s8
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s9
-; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    s_min_u32 s16, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[10:11], 63
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[14:15], s[20:21]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
+; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
+; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s14
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s14
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s18
 ; GCN-IR-NEXT:    s_add_u32 s18, s8, -1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s9, -1
 ; GCN-IR-NEXT:    s_not_b64 s[6:7], s[12:13]
@@ -1066,32 +1090,25 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_3
 ; GCN-IR-NEXT:  .LBB8_4: ; %Flow6
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    s_branch .LBB8_6
-; GCN-IR-NEXT:  .LBB8_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
-; GCN-IR-NEXT:  .LBB8_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s9, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
+; GCN-IR-NEXT:  .LBB8_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-IR-NEXT:    s_mul_i32 s11, s8, s11
+; GCN-IR-NEXT:    s_mul_i32 s9, s9, s10
+; GCN-IR-NEXT:    s_mul_i32 s8, s8, s10
+; GCN-IR-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-IR-NEXT:    s_add_i32 s11, s12, s11
+; GCN-IR-NEXT:    s_add_i32 s11, s11, s9
+; GCN-IR-NEXT:    s_sub_u32 s2, s2, s8
+; GCN-IR-NEXT:    s_subb_u32 s3, s3, s11
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s0, s2, s0
+; GCN-IR-NEXT:    s_subb_u32 s1, s3, s1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 31
@@ -1128,10 +1145,10 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
 ; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1161,37 +1178,40 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
 ; GCN-IR-NEXT:    s_sub_u32 s6, s6, s12
 ; GCN-IR-NEXT:    s_subb_u32 s7, s7, s12
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
+; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_min_u32 s12, s8, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
+; GCN-IR-NEXT:    s_min_u32 s16, s8, s9
+; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
+; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[14:15], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT:    s_or_b64 s[18:19], s[10:11], s[18:19]
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[18:19], exec
+; GCN-IR-NEXT:    s_cselect_b32 s11, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s10, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    s_min_u32 s16, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[20:21], s[10:11], 63
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[14:15], s[20:21]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
+; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
+; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[18:19], 0
+; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[2:3], s14
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s14
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[2:3], s18
 ; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
 ; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
 ; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
@@ -1222,34 +1242,28 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB9_3
 ; GCN-IR-NEXT:  .LBB9_4: ; %Flow3
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_branch .LBB9_6
-; GCN-IR-NEXT:  .LBB9_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
-; GCN-IR-NEXT:  .LBB9_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s6, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-IR-NEXT:  .LBB9_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GCN-IR-NEXT:    s_mul_i32 s8, s6, s11
+; GCN-IR-NEXT:    s_mul_i32 s7, s7, s10
+; GCN-IR-NEXT:    s_mul_i32 s6, s6, s10
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s8, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GCN-IR-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
+; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
+; GCN-IR-NEXT:    v_subb_u32_e32 v0, vcc, v0, v2, vcc
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
@@ -1309,9 +1323,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -1355,49 +1369,52 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s6, s3, 31
-; GCN-IR-NEXT:    s_mov_b32 s7, s6
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s4, s2, s6
-; GCN-IR-NEXT:    s_subb_u32 s5, s3, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s4
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s5
-; GCN-IR-NEXT:    s_min_u32 s8, s6, s7
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[4:5], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[10:11], s[6:7], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[2:3], s[10:11]
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
-; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
+; GCN-IR-NEXT:    s_ashr_i32 s8, s3, 31
+; GCN-IR-NEXT:    s_mov_b32 s9, s8
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s4, s2, s8
+; GCN-IR-NEXT:    s_subb_u32 s5, s3, s8
+; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s4
+; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s5
+; GCN-IR-NEXT:    s_min_u32 s8, s2, s3
+; GCN-IR-NEXT:    s_add_u32 s2, s8, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s3, 0, -1
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[2:3], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[2:3], 63
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[10:11], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s10, 0, 24
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s10, s6, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
+; GCN-IR-NEXT:    s_add_u32 s10, s2, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s3, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
+; GCN-IR-NEXT:    s_sub_i32 s2, 63, s2
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s6
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], 24, s2
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
@@ -1406,46 +1423,41 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
 ; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s3, 0
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  .LBB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s7, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s2, s14, s10
-; GCN-IR-NEXT:    s_subb_u32 s2, s15, s11
-; GCN-IR-NEXT:    s_ashr_i32 s12, s2, 31
+; GCN-IR-NEXT:    s_lshr_b32 s6, s3, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[2:3], s[12:13], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s6, s14, s10
+; GCN-IR-NEXT:    s_subb_u32 s6, s15, s11
+; GCN-IR-NEXT:    s_ashr_i32 s12, s6, 31
 ; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s2, s12, 1
+; GCN-IR-NEXT:    s_and_b32 s6, s12, 1
 ; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[4:5]
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s12
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s13
 ; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
 ; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_3
 ; GCN-IR-NEXT:  .LBB10_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-IR-NEXT:    s_branch .LBB10_6
-; GCN-IR-NEXT:  .LBB10_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
-; GCN-IR-NEXT:  .LBB10_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s4, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s4, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[2:3], 1
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[2:3]
+; GCN-IR-NEXT:  .LBB10_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-IR-NEXT:    s_mul_i32 s6, s4, s11
+; GCN-IR-NEXT:    s_mul_i32 s5, s5, s10
+; GCN-IR-NEXT:    s_mul_i32 s4, s4, s10
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s5, v0
+; GCN-IR-NEXT:    v_sub_i32_e64 v0, vcc, 24, s4
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1950,55 +1962,65 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-LABEL: s_test_srem24_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[4:5], s[2:3], 40
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 s5, 0x41c00000
-; GCN-NEXT:    s_ashr_i32 s6, s4, 30
-; GCN-NEXT:    s_or_b32 s6, s6, 1
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GCN-NEXT:    s_mov_b32 s3, 0x41c00000
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-NEXT:    v_mov_b32_e32 v3, s6
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_or_b32 s8, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s5
+; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s8, 0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v1
+; GCN-NEXT:    s_add_i32 s0, s1, s0
+; GCN-NEXT:    s_mul_i32 s0, s0, s2
+; GCN-NEXT:    s_sub_i32 s0, 24, s0
+; GCN-NEXT:    s_bfe_i32 s0, s0, 0x180000
+; GCN-NEXT:    s_ashr_i32 s1, s0, 31
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem24_k_num_i64:
 ; GCN-IR:       ; %bb.0:
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[2:3], 40
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GCN-IR-NEXT:    s_mov_b32 s5, 0x41c00000
-; GCN-IR-NEXT:    s_ashr_i32 s6, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s6, s6, 1
+; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GCN-IR-NEXT:    s_mov_b32 s3, 0x41c00000
+; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s6
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_or_b32 s8, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s5
+; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-IR-NEXT:    s_cselect_b32 s0, s8, 0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v1
+; GCN-IR-NEXT:    s_add_i32 s0, s1, s0
+; GCN-IR-NEXT:    s_mul_i32 s0, s0, s2
+; GCN-IR-NEXT:    s_sub_i32 s0, 24, s0
+; GCN-IR-NEXT:    s_bfe_i32 s0, s0, 0x180000
+; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = srem i64 24, %x.shr
@@ -2010,58 +2032,62 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-LABEL: s_test_srem24_k_den_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s4, 0x46b6fe00
+; GCN-NEXT:    s_mov_b32 s8, 0x46b6fe00
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GCN-NEXT:    s_ashr_i32 s3, s2, 30
-; GCN-NEXT:    s_or_b32 s3, s3, 1
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mad_f32 v0, -v2, s4, v0
-; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GCN-NEXT:    s_movk_i32 s3, 0x5b7f
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
+; GCN-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-NEXT:    v_mad_f32 v0, -v1, s8, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-NEXT:    s_or_b32 s3, s0, 1
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, s8
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s3, 0
+; GCN-NEXT:    v_readfirstlane_b32 s1, v1
+; GCN-NEXT:    s_add_i32 s0, s1, s0
+; GCN-NEXT:    s_mulk_i32 s0, 0x5b7f
+; GCN-NEXT:    s_sub_i32 s0, s2, s0
+; GCN-NEXT:    s_bfe_i32 s0, s0, 0x180000
+; GCN-NEXT:    s_ashr_i32 s1, s0, 31
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem24_k_den_i64:
 ; GCN-IR:       ; %bb.0:
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b32 s4, 0x46b6fe00
+; GCN-IR-NEXT:    s_mov_b32 s8, 0x46b6fe00
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GCN-IR-NEXT:    s_ashr_i32 s3, s2, 30
-; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
-; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_mad_f32 v0, -v2, s4, v0
-; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GCN-IR-NEXT:    s_movk_i32 s3, 0x5b7f
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
+; GCN-IR-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
+; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
+; GCN-IR-NEXT:    v_mad_f32 v0, -v1, s8, v0
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GCN-IR-NEXT:    s_or_b32 s3, s0, 1
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, s8
+; GCN-IR-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-IR-NEXT:    s_cselect_b32 s0, s3, 0
+; GCN-IR-NEXT:    v_readfirstlane_b32 s1, v1
+; GCN-IR-NEXT:    s_add_i32 s0, s1, s0
+; GCN-IR-NEXT:    s_mulk_i32 s0, 0x5b7f
+; GCN-IR-NEXT:    s_sub_i32 s0, s2, s0
+; GCN-IR-NEXT:    s_bfe_i32 s0, s0, 0x180000
+; GCN-IR-NEXT:    s_ashr_i32 s1, s0, 31
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index 922d9e5f4eaa4..f5401a1a254db 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -96,9 +96,7 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a)
 ; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
 ; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_bitcmp1_b32 s[[SLO]], 0
-; SI: s_cselect_b64 s[[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], -1, 0
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s[[[VLO]]:[[VHI]]]
-; VI: s_cselect_b32 {{s[0-9]+}}, 63, -12
+; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
 define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 0c9bf2ac2f768..889c3fc8b9d0c 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -31,11 +31,11 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; SI-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v3, v0
-; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
-; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
@@ -67,11 +67,11 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; VI-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; VI-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v3, v0
-; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
-; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
 ; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v1, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v2
 ; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
@@ -99,11 +99,11 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_mul_lo_u32 v5, v4, v1
 ; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
-; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v5, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
+; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v5
 ; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[0:1]
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
@@ -117,25 +117,29 @@ define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v3, v1
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, 0, v1
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX1030-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX1030-NEXT:    v_mul_lo_u32 v4, v4, v3
-; GFX1030-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX1030-NEXT:    v_add_nc_u32_e32 v3, v3, v4
-; GFX1030-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX1030-NEXT:    v_mul_lo_u32 v4, v3, v1
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v4
-; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v0, v1
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc_lo
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v3
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX1030-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX1030-NEXT:    s_sub_i32 s4, 0, s2
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX1030-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1030-NEXT:    s_mul_i32 s4, s4, s3
+; GFX1030-NEXT:    s_mul_hi_u32 s4, s3, s4
+; GFX1030-NEXT:    s_add_i32 s3, s3, s4
+; GFX1030-NEXT:    s_mul_hi_u32 s3, s5, s3
+; GFX1030-NEXT:    s_mul_i32 s4, s3, s2
+; GFX1030-NEXT:    s_sub_i32 s4, s5, s4
+; GFX1030-NEXT:    s_add_i32 s5, s3, 1
+; GFX1030-NEXT:    s_sub_i32 s6, s4, s2
+; GFX1030-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX1030-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX1030-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX1030-NEXT:    s_add_i32 s5, s3, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX1030-NEXT:    s_cselect_b32 s2, s5, s3
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1030-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX1030-NEXT:    s_endpgm
 ;
@@ -194,18 +198,21 @@ define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; SI-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; SI-NEXT:    v_mul_lo_u32 v1, v0, s3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; SI-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; SI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; SI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    s_mul_i32 s0, s0, s3
+; SI-NEXT:    s_sub_i32 s0, s2, s0
+; SI-NEXT:    s_sub_i32 s1, s0, s3
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; SI-NEXT:    s_cmp_ge_u32 s0, s3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:    s_cselect_b32 s0, s1, s0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; SI-NEXT:    s_cmp_ge_u32 s0, s3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -226,51 +233,56 @@ define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; VI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; VI-NEXT:    v_mul_lo_u32 v1, v0, s3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; VI-NEXT:    v_sub_u32_e32 v1, vcc, s2, v1
-; VI-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
-; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, s3, v1
-; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; VI-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    s_mul_i32 s0, s0, s3
+; VI-NEXT:    s_sub_i32 s0, s2, s0
+; VI-NEXT:    s_sub_i32 s1, s0, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; VI-NEXT:    s_cmp_ge_u32 s0, s3
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    s_cselect_b32 s0, s1, s0
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; VI-NEXT:    s_cmp_ge_u32 s0, s3
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GCN-LABEL: s_udiv_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GCN-NEXT:    s_sub_i32 s0, 0, s7
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GCN-NEXT:    s_sub_i32 s4, 0, s3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v1, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s7
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GCN-NEXT:    v_sub_u32_e32 v1, vcc, s6, v1
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    v_readfirstlane_b32 s4, v0
+; GCN-NEXT:    s_mul_i32 s4, s4, s3
+; GCN-NEXT:    s_sub_i32 s2, s2, s4
+; GCN-NEXT:    s_sub_i32 s4, s2, s3
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s2, s3
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s2, s4, s2
+; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
+; GCN-NEXT:    s_cmp_ge_u32 s2, s3
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: s_udiv_i32:
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX1030-NEXT:    s_sub_i32 s5, 0, s3
@@ -278,24 +290,23 @@ define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1030-NEXT:    s_mul_i32 s5, s5, s4
 ; GFX1030-NEXT:    s_mul_hi_u32 s5, s4, s5
 ; GFX1030-NEXT:    s_add_i32 s4, s4, s5
 ; GFX1030-NEXT:    s_mul_hi_u32 s4, s2, s4
 ; GFX1030-NEXT:    s_mul_i32 s5, s4, s3
 ; GFX1030-NEXT:    s_sub_i32 s2, s2, s5
+; GFX1030-NEXT:    s_add_i32 s5, s4, 1
+; GFX1030-NEXT:    s_sub_i32 s6, s2, s3
 ; GFX1030-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX1030-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1030-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX1030-NEXT:    s_cselect_b32 s2, s6, s2
 ; GFX1030-NEXT:    s_add_i32 s5, s4, 1
-; GFX1030-NEXT:    v_mov_b32_e32 v0, s5
-; GFX1030-NEXT:    s_sub_i32 s5, s2, s3
-; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, s4, v0, vcc_lo
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc_lo
-; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
-; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX1030-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX1030-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX1030-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; EG-LABEL: s_udiv_i32:
@@ -362,21 +373,21 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; SI-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; SI-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; SI-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; SI-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; SI-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; SI-NEXT:    v_mul_lo_u32 v8, v5, v3
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
+; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
-; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
 ; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
-; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
 ; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
+; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
@@ -417,21 +428,21 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; VI-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; VI-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
-; VI-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
 ; VI-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; VI-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; VI-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; VI-NEXT:    v_mul_lo_u32 v8, v5, v3
 ; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v6
-; VI-NEXT:    v_subrev_u32_e32 v1, vcc, v8, v1
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, v1, v8
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
+; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
 ; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; VI-NEXT:    v_sub_u32_e32 v6, vcc, v0, v2
-; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
 ; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
+; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
@@ -469,20 +480,20 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ; GCN-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GCN-NEXT:    v_add_u32_e32 v6, vcc, v9, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v6
-; GCN-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
 ; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
 ; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
 ; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v3
 ; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
 ; GCN-NEXT:    v_add_u32_e32 v11, vcc, 1, v7
-; GCN-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_sub_u32_e32 v1, vcc, v1, v10
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
+; GCN-NEXT:    v_subrev_u32_e32 v8, vcc, v2, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GCN-NEXT:    v_sub_u32_e32 v8, vcc, v0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
 ; GCN-NEXT:    v_subrev_u32_e32 v9, vcc, v3, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
 ; GCN-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[2:3]
@@ -496,50 +507,58 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad
 ;
 ; GFX1030-LABEL: udiv_v2i32:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v6, v3
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, 0, v2
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v8, 0, v3
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX1030-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GFX1030-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX1030-NEXT:    v_mul_lo_u32 v7, v7, v5
-; GFX1030-NEXT:    v_mul_lo_u32 v8, v8, v6
-; GFX1030-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX1030-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX1030-NEXT:    v_add_nc_u32_e32 v5, v5, v7
-; GFX1030-NEXT:    v_add_nc_u32_e32 v6, v6, v8
-; GFX1030-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GFX1030-NEXT:    v_mul_hi_u32 v6, v1, v6
-; GFX1030-NEXT:    v_mul_lo_u32 v7, v5, v2
-; GFX1030-NEXT:    v_mul_lo_u32 v8, v6, v3
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
-; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, v1, v8
-; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v9, v1, v3
-; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v1, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX1030-NEXT:    v_add_nc_u32_e32 v7, 1, v5
-; GFX1030-NEXT:    v_add_nc_u32_e32 v8, 1, v6
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc_lo
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v1, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v8, vcc_lo
-; GFX1030-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX1030-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1030-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX1030-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v3, s3
+; GFX1030-NEXT:    s_sub_i32 s5, 0, s2
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX1030-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v3
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1030-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX1030-NEXT:    s_mul_i32 s5, s5, s4
+; GFX1030-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GFX1030-NEXT:    s_add_i32 s4, s4, s5
+; GFX1030-NEXT:    s_mul_hi_u32 s4, s6, s4
+; GFX1030-NEXT:    s_mul_i32 s5, s4, s2
+; GFX1030-NEXT:    s_sub_i32 s5, s6, s5
+; GFX1030-NEXT:    s_add_i32 s6, s4, 1
+; GFX1030-NEXT:    s_sub_i32 s7, s5, s2
+; GFX1030-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX1030-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX1030-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX1030-NEXT:    s_add_i32 s6, s4, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1030-NEXT:    s_cselect_b32 s2, s6, s4
+; GFX1030-NEXT:    s_sub_i32 s4, 0, s3
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1030-NEXT:    s_mul_i32 s4, s4, s8
+; GFX1030-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX1030-NEXT:    s_add_i32 s8, s8, s4
+; GFX1030-NEXT:    s_mul_hi_u32 s4, s5, s8
+; GFX1030-NEXT:    s_mul_i32 s6, s4, s3
+; GFX1030-NEXT:    s_sub_i32 s5, s5, s6
+; GFX1030-NEXT:    s_add_i32 s6, s4, 1
+; GFX1030-NEXT:    s_sub_i32 s7, s5, s3
+; GFX1030-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX1030-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX1030-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX1030-NEXT:    s_add_i32 s6, s4, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX1030-NEXT:    s_cselect_b32 s3, s6, s4
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; EG-LABEL: udiv_v2i32:
@@ -641,9 +660,9 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; SI-NEXT:    v_mul_hi_u32 v11, v10, v11
 ; SI-NEXT:    v_mul_hi_u32 v13, v12, v13
 ; SI-NEXT:    v_mul_hi_u32 v15, v14, v15
-; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, v11, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v12
+; SI-NEXT:    v_add_i32_e32 v10, vcc, v12, v13
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, v15, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mul_hi_u32 v8, v4, v8
@@ -654,7 +673,7 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; SI-NEXT:    v_mul_lo_u32 v14, v9, v1
 ; SI-NEXT:    v_mul_lo_u32 v16, v10, v2
 ; SI-NEXT:    v_mul_lo_u32 v18, v11, v3
-; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v12, v4
+; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
 ; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v14
 ; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v16
 ; SI-NEXT:    v_sub_i32_e32 v7, vcc, v7, v18
@@ -666,14 +685,14 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
-; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
 ; SI-NEXT:    v_subrev_i32_e32 v12, vcc, v0, v4
-; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
 ; SI-NEXT:    v_subrev_i32_e32 v13, vcc, v1, v5
-; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
 ; SI-NEXT:    v_subrev_i32_e32 v14, vcc, v2, v6
+; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
+; SI-NEXT:    v_subrev_i32_e32 v15, vcc, v3, v7
 ; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
-; SI-NEXT:    v_sub_i32_e32 v15, vcc, v7, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
@@ -736,8 +755,8 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; VI-NEXT:    v_mul_hi_u32 v11, v10, v11
 ; VI-NEXT:    v_mul_hi_u32 v13, v12, v13
 ; VI-NEXT:    v_mul_hi_u32 v15, v14, v15
-; VI-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
-; VI-NEXT:    v_add_u32_e32 v9, vcc, v10, v11
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v11, v10
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, v12, v13
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v14
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -750,8 +769,8 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; VI-NEXT:    v_mul_lo_u32 v16, v10, v2
 ; VI-NEXT:    v_mul_lo_u32 v18, v11, v3
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
-; VI-NEXT:    v_subrev_u32_e32 v5, vcc, v14, v5
-; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v16, v6
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v5, v14
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, v6, v16
 ; VI-NEXT:    v_sub_u32_e32 v7, vcc, v7, v18
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v8
 ; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v9
@@ -761,14 +780,14 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
-; VI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
 ; VI-NEXT:    v_subrev_u32_e32 v12, vcc, v0, v4
-; VI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
+; VI-NEXT:    v_cndmask_b32_e64 v8, v8, v13, s[0:1]
 ; VI-NEXT:    v_subrev_u32_e32 v13, vcc, v1, v5
-; VI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
 ; VI-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
-; VI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
+; VI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
 ; VI-NEXT:    v_subrev_u32_e32 v15, vcc, v3, v7
+; VI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
 ; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
 ; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
@@ -831,8 +850,8 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_mul_hi_u32 v13, v12, v13
 ; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
 ; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
-; GCN-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_add_u32_e32 v11, vcc, v12, v13
+; GCN-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; GCN-NEXT:    v_add_u32_e32 v11, vcc, v13, v12
 ; GCN-NEXT:    v_add_u32_e32 v12, vcc, v14, v15
 ; GCN-NEXT:    v_add_u32_e32 v13, vcc, v17, v16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -845,8 +864,8 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
 ; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
 ; GCN-NEXT:    v_sub_u32_e32 v4, vcc, v4, v14
-; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v16, v5
-; GCN-NEXT:    v_subrev_u32_e32 v6, vcc, v18, v6
+; GCN-NEXT:    v_sub_u32_e32 v5, vcc, v5, v16
+; GCN-NEXT:    v_sub_u32_e32 v6, vcc, v6, v18
 ; GCN-NEXT:    v_sub_u32_e32 v7, vcc, v7, v19
 ; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
 ; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
@@ -856,120 +875,136 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[6:7], v7, v3
+; GCN-NEXT:    v_subrev_u32_e32 v18, vcc, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, v15, s[0:1]
-; GCN-NEXT:    v_subrev_u32_e32 v15, vcc, v0, v4
+; GCN-NEXT:    v_subrev_u32_e32 v15, vcc, v1, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v11, v11, v17, s[2:3]
-; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v1, v5
+; GCN-NEXT:    v_subrev_u32_e32 v17, vcc, v2, v6
 ; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v2, v6
+; GCN-NEXT:    v_subrev_u32_e32 v14, vcc, v3, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v16, s[6:7]
-; GCN-NEXT:    v_subrev_u32_e32 v16, vcc, v3, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v15, s[0:1]
-; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v17, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v14, s[4:5]
-; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
-; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v16, s[6:7]
-; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v13
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v18, s[0:1]
+; GCN-NEXT:    v_add_u32_e32 v16, vcc, 1, v10
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s[2:3]
+; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v11
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v17, s[4:5]
+; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v12
+; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v14, s[6:7]
+; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v13
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v15, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v10, v16, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v17, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v11, v15, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v14, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v17, vcc
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc
 ; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX1030-LABEL: udiv_v4i32:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    s_clause 0x1
-; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
-; GFX1030-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
+; GFX1030-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
+; GFX1030-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(1)
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v9, v0
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v10, v1
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v11, v2
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v12, v3
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, 0, v0
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v9, v9
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v10, v10
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v11, v11
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v12, v12
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, 0, v1
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, 0, v2
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, 0, v3
-; GFX1030-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
-; GFX1030-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
-; GFX1030-NEXT:    v_mul_f32_e32 v11, 0x4f7ffffe, v11
-; GFX1030-NEXT:    v_mul_f32_e32 v12, 0x4f7ffffe, v12
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GFX1030-NEXT:    v_mul_lo_u32 v13, v13, v9
-; GFX1030-NEXT:    v_mul_lo_u32 v14, v14, v10
-; GFX1030-NEXT:    v_mul_lo_u32 v15, v15, v11
-; GFX1030-NEXT:    v_mul_lo_u32 v16, v16, v12
-; GFX1030-NEXT:    v_mul_hi_u32 v13, v9, v13
-; GFX1030-NEXT:    v_mul_hi_u32 v14, v10, v14
-; GFX1030-NEXT:    v_mul_hi_u32 v15, v11, v15
-; GFX1030-NEXT:    v_mul_hi_u32 v16, v12, v16
-; GFX1030-NEXT:    v_add_nc_u32_e32 v9, v9, v13
-; GFX1030-NEXT:    v_add_nc_u32_e32 v10, v10, v14
-; GFX1030-NEXT:    v_add_nc_u32_e32 v11, v11, v15
-; GFX1030-NEXT:    v_add_nc_u32_e32 v12, v12, v16
+; GFX1030-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1030-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_mul_hi_u32 v9, v4, v9
-; GFX1030-NEXT:    v_mul_hi_u32 v10, v5, v10
-; GFX1030-NEXT:    v_mul_hi_u32 v11, v6, v11
-; GFX1030-NEXT:    v_mul_hi_u32 v12, v7, v12
-; GFX1030-NEXT:    v_mul_lo_u32 v13, v9, v0
-; GFX1030-NEXT:    v_mul_lo_u32 v14, v10, v1
-; GFX1030-NEXT:    v_mul_lo_u32 v15, v11, v2
-; GFX1030-NEXT:    v_mul_lo_u32 v16, v12, v3
-; GFX1030-NEXT:    v_add_nc_u32_e32 v17, 1, v9
-; GFX1030-NEXT:    v_add_nc_u32_e32 v18, 1, v10
-; GFX1030-NEXT:    v_add_nc_u32_e32 v19, 1, v11
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v4, v4, v13
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v5, v14
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v6, v6, v15
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v7, v7, v16
-; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v14, v4, v0
-; GFX1030-NEXT:    v_cmp_ge_u32_e64 s0, v5, v1
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v15, v5, v1
-; GFX1030-NEXT:    v_cmp_ge_u32_e64 s1, v6, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc_lo
-; GFX1030-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
-; GFX1030-NEXT:    v_cndmask_b32_e64 v10, v10, v18, s0
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v16, v6, v2
-; GFX1030-NEXT:    v_cmp_ge_u32_e64 s2, v7, v3
-; GFX1030-NEXT:    v_add_nc_u32_e32 v14, 1, v9
-; GFX1030-NEXT:    v_cndmask_b32_e64 v5, v5, v15, s0
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v4, v0
-; GFX1030-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s1
-; GFX1030-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s2
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v13, v7, v3
-; GFX1030-NEXT:    v_add_nc_u32_e32 v15, 1, v10
-; GFX1030-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v9, v14, vcc_lo
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v5, v1
-; GFX1030-NEXT:    v_add_nc_u32_e32 v16, 1, v11
-; GFX1030-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s2
-; GFX1030-NEXT:    v_add_nc_u32_e32 v13, 1, v12
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v10, v15, vcc_lo
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v6, v2
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v11, v16, vcc_lo
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v7, v3
-; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v12, v13, vcc_lo
-; GFX1030-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v4
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX1030-NEXT:    s_sub_i32 s6, 0, s2
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX1030-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX1030-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1030-NEXT:    s_mul_i32 s6, s6, s4
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s4, s6
+; GFX1030-NEXT:    s_add_i32 s4, s4, s6
+; GFX1030-NEXT:    s_mul_hi_u32 s4, s7, s4
+; GFX1030-NEXT:    s_mul_i32 s6, s4, s2
+; GFX1030-NEXT:    s_sub_i32 s6, s7, s6
+; GFX1030-NEXT:    s_add_i32 s7, s4, 1
+; GFX1030-NEXT:    s_sub_i32 s8, s6, s2
+; GFX1030-NEXT:    s_cmp_ge_u32 s6, s2
+; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX1030-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX1030-NEXT:    s_add_i32 s7, s4, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s6, s2
+; GFX1030-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1030-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX1030-NEXT:    s_sub_i32 s6, 0, s3
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX1030-NEXT:    s_mul_i32 s6, s6, s9
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s9, s6
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX1030-NEXT:    s_add_i32 s9, s9, s6
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s7, s9
+; GFX1030-NEXT:    v_readfirstlane_b32 s10, v0
+; GFX1030-NEXT:    s_mul_i32 s8, s6, s3
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX1030-NEXT:    s_sub_i32 s7, s7, s8
+; GFX1030-NEXT:    s_add_i32 s8, s6, 1
+; GFX1030-NEXT:    s_sub_i32 s9, s7, s3
+; GFX1030-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX1030-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX1030-NEXT:    s_add_i32 s8, s6, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v6
+; GFX1030-NEXT:    s_cselect_b32 s3, s8, s6
+; GFX1030-NEXT:    s_sub_i32 s6, 0, s5
+; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
+; GFX1030-NEXT:    s_mul_i32 s6, s6, s10
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s10, s6
+; GFX1030-NEXT:    s_add_i32 s10, s10, s6
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s7, s10
+; GFX1030-NEXT:    s_mul_i32 s8, s6, s5
+; GFX1030-NEXT:    s_sub_i32 s7, s7, s8
+; GFX1030-NEXT:    s_add_i32 s8, s6, 1
+; GFX1030-NEXT:    s_sub_i32 s9, s7, s5
+; GFX1030-NEXT:    s_cmp_ge_u32 s7, s5
+; GFX1030-NEXT:    v_readfirstlane_b32 s10, v0
+; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX1030-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX1030-NEXT:    s_add_i32 s8, s6, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s7, s5
+; GFX1030-NEXT:    v_readfirstlane_b32 s7, v7
+; GFX1030-NEXT:    s_cselect_b32 s5, s8, s6
+; GFX1030-NEXT:    s_sub_i32 s6, 0, s2
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1030-NEXT:    s_mul_i32 s6, s6, s10
+; GFX1030-NEXT:    v_mov_b32_e32 v2, s5
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s10, s6
+; GFX1030-NEXT:    s_add_i32 s10, s10, s6
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s7, s10
+; GFX1030-NEXT:    s_mul_i32 s8, s6, s2
+; GFX1030-NEXT:    s_sub_i32 s7, s7, s8
+; GFX1030-NEXT:    s_add_i32 s8, s6, 1
+; GFX1030-NEXT:    s_sub_i32 s9, s7, s2
+; GFX1030-NEXT:    s_cmp_ge_u32 s7, s2
+; GFX1030-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX1030-NEXT:    s_cselect_b32 s7, s9, s7
+; GFX1030-NEXT:    s_add_i32 s8, s6, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s7, s2
+; GFX1030-NEXT:    s_cselect_b32 s2, s8, s6
+; GFX1030-NEXT:    v_mov_b32_e32 v3, s2
+; GFX1030-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GFX1030-NEXT:    s_endpgm
 ;
 ; EG-LABEL: udiv_v4i32:
@@ -1855,11 +1890,11 @@ define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; SI-NEXT:    v_mul_hi_u32 v1, v2, v1
 ; SI-NEXT:    v_mul_lo_u32 v3, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
-; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
-; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v2
-; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
+; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
 ; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
@@ -1903,10 +1938,10 @@ define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; VI-NEXT:    v_mul_lo_u32 v3, v1, v0
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
-; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
-; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; VI-NEXT:    v_sub_u32_e32 v3, vcc, v2, v0
-; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
+; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
 ; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
@@ -1958,10 +1993,10 @@ define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; GCN-NEXT:    v_mul_lo_u32 v5, v4, v3
 ; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
 ; GCN-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; GCN-NEXT:    v_sub_u32_e32 v5, vcc, v2, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v3, v2
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
@@ -1980,32 +2015,39 @@ define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; GFX1030-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:2
 ; GFX1030-NEXT:    global_load_ushort v4, v0, s[2:3]
 ; GFX1030-NEXT:    s_waitcnt vmcnt(3)
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1030-NEXT:    s_waitcnt vmcnt(2)
+; GFX1030-NEXT:    v_readfirstlane_b32 s3, v2
 ; GFX1030-NEXT:    s_waitcnt vmcnt(1)
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX1030-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX1030-NEXT:    v_cvt_f32_u32_e32 v2, v1
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, 0, v1
-; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX1030-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX1030-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX1030-NEXT:    v_mul_lo_u32 v5, v5, v2
-; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GFX1030-NEXT:    v_add_nc_u32_e32 v2, v2, v5
-; GFX1030-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX1030-NEXT:    v_mul_lo_u32 v4, v2, v1
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
-; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v5, v3, v1
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX1030-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX1030-NEXT:    v_add_nc_u32_e32 v4, 1, v2
-; GFX1030-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v3, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX1030-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX1030-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX1030-NEXT:    s_or_b32 s2, s3, s2
+; GFX1030-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX1030-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX1030-NEXT:    s_sub_i32 s6, 0, s2
+; GFX1030-NEXT:    s_or_b32 s4, s5, s4
+; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX1030-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX1030-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1030-NEXT:    s_mul_i32 s6, s6, s3
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s3, s6
+; GFX1030-NEXT:    s_add_i32 s3, s3, s6
+; GFX1030-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX1030-NEXT:    s_mul_i32 s5, s3, s2
+; GFX1030-NEXT:    s_sub_i32 s4, s4, s5
+; GFX1030-NEXT:    s_add_i32 s5, s3, 1
+; GFX1030-NEXT:    s_sub_i32 s6, s4, s2
+; GFX1030-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX1030-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX1030-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX1030-NEXT:    s_add_i32 s5, s3, 1
+; GFX1030-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX1030-NEXT:    s_cselect_b32 s2, s5, s3
+; GFX1030-NEXT:    s_and_b32 s2, s2, 0xffffff
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1030-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX1030-NEXT:    s_endpgm
 ;
@@ -2352,7 +2394,7 @@ define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readon
 ; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -2378,7 +2420,7 @@ define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readon
 ; VI-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -2402,7 +2444,7 @@ define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readon
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    flat_store_byte v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 ;
@@ -2522,7 +2564,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_mul_lo_u32 v5, v3, s4
 ; SI-NEXT:    v_mul_lo_u32 v6, v2, s4
 ; SI-NEXT:    s_mov_b32 s4, 0x186a0
-; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
+; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; SI-NEXT:    v_mul_lo_u32 v5, v2, v4
 ; SI-NEXT:    v_mul_hi_u32 v7, v2, v6
@@ -2598,8 +2640,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
-; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
-; VI-NEXT:    v_add_u32_e32 v8, vcc, v3, v4
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
+; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
 ; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
@@ -2615,7 +2657,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
-; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
 ; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
 ; VI-NEXT:    v_mul_hi_u32 v8, v6, v2
@@ -2685,8 +2727,8 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
-; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_add_u32_e32 v8, vcc, v3, v4
+; GCN-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
 ; GCN-NEXT:    v_add_u32_e32 v9, vcc, v5, v3
@@ -2702,7 +2744,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
-; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_add_u32_e32 v5, vcc, v3, v4
 ; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
 ; GCN-NEXT:    v_mul_hi_u32 v8, v6, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 1fe35d7da8405..9c99ef27a0748 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -48,9 +48,9 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s5, v0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -100,12 +100,13 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
@@ -115,9 +116,8 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -125,38 +125,41 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s4
-; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s5
-; GCN-IR-NEXT:    s_min_u32 s10, s14, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
-; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[8:9], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[12:13], s[18:19]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s4
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s5
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
+; GCN-IR-NEXT:    s_min_u32 s14, s6, s7
+; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[16:17]
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
+; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
+; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s12
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s16
 ; GCN-IR-NEXT:    s_add_u32 s15, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s16, s5, -1
 ; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
@@ -187,18 +190,12 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
 ; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-IR-NEXT:    s_branch .LBB0_6
-; GCN-IR-NEXT:  .LBB0_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
-; GCN-IR-NEXT:  .LBB0_6: ; %udiv-end
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[2:3]
+; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, %y
@@ -671,36 +668,36 @@ define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
 ; GCN-LABEL: s_test_udiv24_i48:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s4, s4, 0xff000000
-; GCN-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_alignbit_b32 v0, s5, v0, 24
+; GCN-NEXT:    s_and_b32 s0, s2, 0xff000000
+; GCN-NEXT:    s_and_b32 s1, s3, 0xffff
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_alignbit_b32 v0, s1, v0, 24
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
-; GCN-NEXT:    s_and_b32 s8, s3, 0xffff
-; GCN-NEXT:    s_and_b32 s9, s2, 0xff000000
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[4:5], 24
+; GCN-NEXT:    s_and_b32 s7, s7, 0xffff
+; GCN-NEXT:    s_and_b32 s6, s6, 0xff000000
+; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], 24
 ; GCN-NEXT:    v_mac_f32_e32 v1, 0, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v1, v1
-; GCN-NEXT:    s_sub_u32 s2, 0, s2
-; GCN-NEXT:    s_subb_u32 s3, 0, s3
-; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_sub_u32 s8, 0, s0
+; GCN-NEXT:    s_subb_u32 s9, 0, s1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v1
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v1
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s8, v1
+; GCN-NEXT:    v_mul_lo_u32 v5, s9, v1
+; GCN-NEXT:    v_mul_lo_u32 v6, s8, v1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, v3
@@ -719,11 +716,11 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v3, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s8, v1
+; GCN-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_lo_u32 v4, s8, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v7, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v1, v4
@@ -740,9 +737,9 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NEXT:    v_mov_b32_e32 v3, s6
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GCN-NEXT:    v_alignbit_b32 v3, s8, v3, 24
+; GCN-NEXT:    v_alignbit_b32 v3, s7, v3, 24
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v1, v3, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, v3, v2
@@ -756,12 +753,12 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v1
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 2, v1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 2, v1
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v10
 ; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v3, v0
@@ -769,25 +766,26 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
-; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; GCN-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_udiv24_i48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_and_b32 s3, s7, 0xffff
 ; GCN-IR-NEXT:    s_and_b32 s2, s6, 0xff000000
@@ -797,26 +795,28 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_lshr_b64 s[0:1], s[0:1], 24
 ; GCN-IR-NEXT:    s_and_b32 s9, s9, 0xffff
 ; GCN-IR-NEXT:    s_and_b32 s1, s1, 0xffff
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[0:1], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s0
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s1
-; GCN-IR-NEXT:    s_min_u32 s10, s6, s7
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s8
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s9
-; GCN-IR-NEXT:    s_min_u32 s14, s6, s7
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[0:1], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[2:3], s[6:7]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s0
+; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s1
+; GCN-IR-NEXT:    s_min_u32 s10, s2, s3
+; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s8
+; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s9
+; GCN-IR-NEXT:    s_min_u32 s14, s2, s3
 ; GCN-IR-NEXT:    s_sub_u32 s6, s10, s14
 ; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
 ; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[6:7], 63
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[6:7], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[12:13], s[18:19]
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[12:13], s[16:17]
+; GCN-IR-NEXT:    s_and_b64 s[12:13], s[16:17], exec
+; GCN-IR-NEXT:    s_cselect_b32 s13, 0, s9
+; GCN-IR-NEXT:    s_cselect_b32 s12, 0, s8
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s12, s6, 1
@@ -858,19 +858,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_3
 ; GCN-IR-NEXT:  .LBB7_4: ; %Flow3
 ; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-IR-NEXT:    s_branch .LBB7_6
-; GCN-IR-NEXT:  .LBB7_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
-; GCN-IR-NEXT:  .LBB7_6: ; %udiv-end
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[2:3], s[0:1]
+; GCN-IR-NEXT:  .LBB7_5: ; %udiv-end
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-IR-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr i48 %x, 24
@@ -966,54 +961,57 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v4, s[0:1], 2, v0
+; GCN-NEXT:    v_add_i32_e64 v4, s[0:1], 1, v0
 ; GCN-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, 0, s[0:1]
-; GCN-NEXT:    v_add_i32_e64 v6, s[0:1], 1, v0
+; GCN-NEXT:    v_add_i32_e64 v6, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, 0, s[0:1]
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v5, v7, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, v4, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_udiv_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_min_u32 s8, s6, s7
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[10:11], s[6:7], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[4:5], s[10:11]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
+; GCN-IR-NEXT:    s_add_u32 s10, s8, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s11, 0, -1
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s10, s6, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s6
+; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
+; GCN-IR-NEXT:    s_sub_i32 s9, 63, s10
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s9
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
+; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s12
 ; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s3, -1
 ; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
@@ -1043,16 +1041,12 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_3
 ; GCN-IR-NEXT:  .LBB8_4: ; %Flow5
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-IR-NEXT:    s_branch .LBB8_6
-; GCN-IR-NEXT:  .LBB8_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
-; GCN-IR-NEXT:  .LBB8_6: ; %udiv-end
+; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[2:3]
+; GCN-IR-NEXT:  .LBB8_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 24, %x
@@ -1335,21 +1329,22 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_movk_i32 s4, 0xffe8
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_movk_i32 s8, 0xffe8
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v0, s4
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s8
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, s8
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, s8
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
@@ -1368,12 +1363,11 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s8
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s8
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s8
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -1390,15 +1384,15 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
@@ -1406,14 +1400,14 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, 24
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GCN-NEXT:    v_mul_lo_u32 v8, v0, 24
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
+; GCN-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s6, v8
 ; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v8
 ; GCN-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
@@ -1421,17 +1415,17 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], 23, v8
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v4, -1, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_udiv_k_den_i64:
@@ -1442,26 +1436,29 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
 ; GCN-IR-NEXT:    s_min_u32 s10, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s6, 59, s10
-; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
+; GCN-IR-NEXT:    s_sub_u32 s8, 59, s10
+; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[8:9], s[6:7], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[8:9], s[12:13]
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[6:7], s[8:9], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[8:9], 63
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s8, s6, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s7, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[8:9], 0
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GCN-IR-NEXT:    s_add_u32 s12, s8, 1
+; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s8
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_lshr_b64 s[8:9], s[2:3], s12
 ; GCN-IR-NEXT:    s_add_u32 s2, s10, 0xffffffc4
 ; GCN-IR-NEXT:    s_addc_u32 s3, 0, -1
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
@@ -1488,18 +1485,12 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_3
 ; GCN-IR-NEXT:  .LBB11_4: ; %Flow5
 ; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-IR-NEXT:    s_branch .LBB11_6
-; GCN-IR-NEXT:  .LBB11_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[8:9]
-; GCN-IR-NEXT:  .LBB11_6: ; %udiv-end
+; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[2:3]
+; GCN-IR-NEXT:  .LBB11_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, 24

diff  --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 58c1445b9f303..e1a79bf1e9c7a 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -36,77 +36,84 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
 ;
 ; GFX6-LABEL: test_udivrem:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x26
+; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x26
+; GFX6-NEXT:    s_load_dword s9, s[0:1], 0x1d
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x13
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    s_mov_b32 s10, s6
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX6-NEXT:    s_sub_i32 s3, 0, s2
-; GFX6-NEXT:    s_mov_b32 s11, s7
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_sub_i32 s2, 0, s8
+; GFX6-NEXT:    s_mov_b32 s3, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x1d
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    s_mov_b32 s2, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s3, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s10, v0
+; GFX6-NEXT:    s_mul_i32 s10, s10, s8
+; GFX6-NEXT:    s_sub_i32 s9, s9, s10
+; GFX6-NEXT:    s_sub_i32 s10, s9, s8
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s9, s8
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    s_cselect_b32 s9, s10, s9
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_sub_i32 s10, s9, s8
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 1, v0
+; GFX6-NEXT:    s_cmp_ge_u32 s9, s8
+; GFX6-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_cselect_b32 s8, s10, s9
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, v2, s[0:1]
-; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_udivrem:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x98
-; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x74
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
+; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x98
+; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x74
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX8-NEXT:    s_sub_i32 s2, 0, s6
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX8-NEXT:    s_sub_i32 s2, 0, s4
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x4c
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v4, s5, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s6
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
-; GFX8-NEXT:    flat_store_dword v[0:1], v2
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX8-NEXT:    s_mul_i32 s0, s0, s4
+; GFX8-NEXT:    s_sub_i32 s0, s5, s0
+; GFX8-NEXT:    s_sub_i32 s1, s0, s4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
+; GFX8-NEXT:    s_cmp_ge_u32 s0, s4
+; GFX8-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX8-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    s_sub_i32 s1, s0, s4
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
+; GFX8-NEXT:    s_cmp_ge_u32 s0, s4
+; GFX8-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX8-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    flat_store_dword v[2:3], v0
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out0
@@ -158,43 +165,47 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s6
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s6
+; GFX6-NEXT:    s_sub_i32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s6
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX6-NEXT:    s_cselect_b32 s4, s3, s2
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s7
-; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX6-NEXT:    s_mul_i32 s6, s6, s7
+; GFX6-NEXT:    s_sub_i32 s5, s5, s6
+; GFX6-NEXT:    s_sub_i32 s6, s5, s7
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s7
+; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    s_sub_i32 s6, s5, s7
+; GFX6-NEXT:    s_cmp_ge_u32 s5, s7
+; GFX6-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -230,7 +241,7 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX8-NEXT:    v_mul_lo_u32 v0, s3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
@@ -323,77 +334,85 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_i32 s12, 0, s8
-; GFX6-NEXT:    s_sub_i32 s13, 0, s9
+; GFX6-NEXT:    s_sub_i32 s2, 0, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    s_sub_i32 s4, 0, s10
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT:    s_sub_i32 s4, 0, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s8
+; GFX6-NEXT:    s_sub_i32 s2, s4, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s8
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX6-NEXT:    s_cselect_b32 s4, s3, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s11
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s9
+; GFX6-NEXT:    s_sub_i32 s2, s5, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s9
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s9
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s9
+; GFX6-NEXT:    s_cselect_b32 s5, s3, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s10
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s2, s2, s10
+; GFX6-NEXT:    s_sub_i32 s2, s6, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s10
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s10
+; GFX6-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX6-NEXT:    s_sub_i32 s3, s2, s10
+; GFX6-NEXT:    s_cmp_ge_u32 s2, s10
+; GFX6-NEXT:    s_cselect_b32 s6, s3, s2
+; GFX6-NEXT:    s_sub_i32 s2, 0, s11
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v1
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v2, s7, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s5
+; GFX6-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX6-NEXT:    s_mul_i32 s4, s4, s11
+; GFX6-NEXT:    s_sub_i32 s4, s7, s4
+; GFX6-NEXT:    s_sub_i32 s5, s4, s11
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    s_sub_i32 s5, s4, s11
+; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
+; GFX6-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -449,7 +468,7 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX8-NEXT:    v_mul_lo_u32 v0, s4, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index b55a429cc7a89..a7b85f62b0504 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -76,12 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)
 
 ; GCN-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; VI-DAG: s_cmp_eq_u32
-; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
-; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
-; SI-DAG: s_cmp_eq_u32
-; SI-DAG: s_cselect_b64 vcc, -1, 0
-; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, {{v[0-9]+}}, vcc
+; GCN-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[ZERO]]:[[SEL]]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index dbec6f144b4c6..33038bb4fe238 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,7 +51,7 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -104,19 +104,19 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v4, s11
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v4, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -124,38 +124,41 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[4:5], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s4
-; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s5
-; GCN-IR-NEXT:    s_min_u32 s10, s14, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
-; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[8:9], 63
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[12:13], s[18:19]
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[4:5], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s4
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s5
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
+; GCN-IR-NEXT:    s_min_u32 s14, s6, s7
+; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[12:13], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[16:17]
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[16:17], exec
+; GCN-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[16:17], s[12:13], 0
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
+; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[16:17], 0
+; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s12
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s16
 ; GCN-IR-NEXT:    s_add_u32 s16, s4, -1
 ; GCN-IR-NEXT:    s_addc_u32 s17, s5, -1
 ; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
@@ -186,30 +189,24 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_3
 ; GCN-IR-NEXT:  .LBB0_4: ; %Flow6
 ; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    s_branch .LBB0_6
-; GCN-IR-NEXT:  .LBB0_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
-; GCN-IR-NEXT:  .LBB0_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s4, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s4, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
-; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
+; GCN-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-IR-NEXT:    s_mov_b32 s12, s0
+; GCN-IR-NEXT:    s_mul_i32 s0, s4, s9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT:    s_mul_i32 s0, s5, s8
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
+; GCN-IR-NEXT:    s_mul_i32 s0, s4, s8
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    s_mov_b32 s10, -1
-; GCN-IR-NEXT:    s_mov_b32 s8, s0
-; GCN-IR-NEXT:    s_mov_b32 s9, s1
+; GCN-IR-NEXT:    s_mov_b32 s15, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s14, -1
+; GCN-IR-NEXT:    s_mov_b32 s13, s1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
@@ -753,9 +750,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v2
@@ -781,7 +778,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v1, s7, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
@@ -799,47 +796,50 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_min_u32 s8, s6, s7
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[10:11], s[6:7], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[4:5], s[10:11]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
+; GCN-IR-NEXT:    s_add_u32 s10, s8, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s11, 0, -1
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 63
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[6:7], s[12:13]
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[12:13], exec
+; GCN-IR-NEXT:    s_cselect_b32 s6, 0, 24
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s10, s6, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s6
+; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
+; GCN-IR-NEXT:    s_sub_i32 s9, 63, s10
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], 24, s9
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
+; GCN-IR-NEXT:    s_lshr_b64 s[10:11], 24, s12
 ; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
 ; GCN-IR-NEXT:    s_addc_u32 s15, s3, -1
 ; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
@@ -869,27 +869,22 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_3
 ; GCN-IR-NEXT:  .LBB6_4: ; %Flow5
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-IR-NEXT:    s_branch .LBB6_6
-; GCN-IR-NEXT:  .LBB6_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
-; GCN-IR-NEXT:  .LBB6_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
+; GCN-IR-NEXT:  .LBB6_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-IR-NEXT:    s_mov_b32 s8, s0
+; GCN-IR-NEXT:    s_mul_i32 s0, s2, s7
+; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT:    s_mul_i32 s0, s3, s6
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
+; GCN-IR-NEXT:    s_mul_i32 s0, s2, s6
+; GCN-IR-NEXT:    v_sub_i32_e64 v0, vcc, 24, s0
+; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    s_mov_b32 s9, s1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 24, %x
   store i64 %result, i64 addrspace(1)* %out
@@ -902,9 +897,9 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_movk_i32 s4, 0xffe8
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_movk_i32 s2, 0xffe8
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
@@ -912,13 +907,13 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v0, s4
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, s2
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, s2
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -935,12 +930,12 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s2
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -957,15 +952,15 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
@@ -975,8 +970,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
@@ -987,16 +982,16 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], 23, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem_k_den_i64:
@@ -1007,26 +1002,29 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
 ; GCN-IR-NEXT:    s_min_u32 s8, s6, s7
-; GCN-IR-NEXT:    s_sub_u32 s6, 59, s8
-; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
+; GCN-IR-NEXT:    s_sub_u32 s10, 59, s8
+; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[10:11], s[6:7], 63
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[4:5], s[10:11]
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[10:11], s[12:13]
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[6:7], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 63
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; GCN-IR-NEXT:    s_cselect_b32 s7, 0, s3
+; GCN-IR-NEXT:    s_cselect_b32 s6, 0, s2
+; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s10, s6, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[12:13], 0
+; GCN-IR-NEXT:    s_sub_i32 s9, 63, s10
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s9
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s10
+; GCN-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s12
 ; GCN-IR-NEXT:    s_add_u32 s8, s8, 0xffffffc4
 ; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
@@ -1053,28 +1051,21 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_3
 ; GCN-IR-NEXT:  .LBB7_4: ; %Flow5
 ; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-IR-NEXT:    s_branch .LBB7_6
-; GCN-IR-NEXT:  .LBB7_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[10:11]
-; GCN-IR-NEXT:  .LBB7_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, v0, 24
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
+; GCN-IR-NEXT:  .LBB7_5: ; %udiv-end
+; GCN-IR-NEXT:    v_mul_hi_u32 v0, s6, 24
+; GCN-IR-NEXT:    s_mov_b32 s8, s0
+; GCN-IR-NEXT:    s_mul_i32 s0, s7, 24
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
+; GCN-IR-NEXT:    s_mul_i32 s0, s6, 24
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    s_mov_b32 s9, s1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, 24
   store i64 %result, i64 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 6e7bdb107a979..9b1b552b82a0f 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -13,9 +13,9 @@
 ; VI: s_cselect_b32
 
 ; SI-DAG: s_cmp_gt_i32
-; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: s_cselect_b32
 ; SI-DAG: s_cmp_gt_i32
-; SI-DAG: v_cndmask_b32_e32
+; SI-DAG: s_cselect_b32
 
 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
 entry:
@@ -59,10 +59,10 @@ entry:
 ; VI: s_cselect_b32
 ; VI: s_cselect_b32
 
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e64
-; SI-DAG: v_cndmask_b32_e32
+; SI-DAG: s_cselect_b32
+; SI-DAG: s_cselect_b32
+; SI-DAG: s_cselect_b32
+; SI-DAG: s_cselect_b32
 
 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
 entry:


        


More information about the llvm-commits mailing list