[llvm] 67d0f18 - [AMDGPU] Delete redundant s_or_b32 (#165261)

via llvm-commits <llvm-commits at lists.llvm.org>
Fri Nov 7 07:27:24 PST 2025


Author: LU-JOHN
Date: 2025-11-07T09:27:20-06:00
New Revision: 67d0f181f469ee7aa1c9c99bf8c66ec664b5c085

URL: https://github.com/llvm/llvm-project/commit/67d0f181f469ee7aa1c9c99bf8c66ec664b5c085
DIFF: https://github.com/llvm/llvm-project/commit/67d0f181f469ee7aa1c9c99bf8c66ec664b5c085.diff

LOG: [AMDGPU] Delete redundant s_or_b32 (#165261)

Transform sequences like:

```
s_cselect_b64 s[12:13], -1, 0
s_or_b32 s6, s12, s13
```

where `s6` is dead, to:

`s_cselect_b64 s[12:13], -1, 0`
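
At the MIR level, the pattern handled by the new code in `optimizeCompareInstr` is roughly the sequence below (a sketch adapted from the comment added in `SIInstrInfo.cpp`; register names are illustrative). Once the `s_cmp` of `sY` against zero has been folded away, `sY` has no remaining uses, and since the `s_cselect_b64` already sets SCC correctly, the `s_or_b32` itself can be deleted:

```
sX  = s_cselect_b64 (non-zero imm), 0    ; foldable select, sets SCC
sLo = copy sX.sub0
sHi = copy sX.sub1
sY  = s_or_b32 sLo, sHi                  ; deleted once sY is unused
```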

---------

Signed-off-by: John Lu <John.Lu at amd.com>

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/carryout-selection.ll
    llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
    llvm/test/CodeGen/AMDGPU/optimize-compare.mir
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/uaddo.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/usubo.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6ce18ea921a9b..9c74c654d8e35 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10163,7 +10163,7 @@ static bool followSubRegDef(MachineInstr &MI,
 }
 
 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                                     MachineRegisterInfo &MRI) {
+                                     const MachineRegisterInfo &MRI) {
   assert(MRI.isSSA());
   if (!P.Reg.isVirtual())
     return nullptr;
@@ -10628,6 +10628,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
 static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
                         const SIRegisterInfo &RI) {
   MachineInstr *KillsSCC = nullptr;
+  if (SCCValid->getParent() != SCCRedefine->getParent())
+    return false;
   for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
                                      SCCRedefine->getIterator())) {
     if (MI.modifiesRegister(AMDGPU::SCC, &RI))
@@ -10672,8 +10674,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (CmpValue != 0)
       return false;
 
-    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
-    if (!Def || Def->getParent() != CmpInstr.getParent())
+    MachineInstr *Def = MRI->getVRegDef(SrcReg);
+    if (!Def)
       return false;
 
     // For S_OP that set SCC = DST!=0, do the transformation
@@ -10692,6 +10694,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!optimizeSCC(Def, &CmpInstr, RI))
       return false;
 
+    // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
+    // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
+    // 64-bit foldableSelect then delete s_or_b32 in the sequence:
+    //    sX = s_cselect_b64 (non-zero imm), 0
+    //    sLo = copy sX.sub0
+    //    sHi = copy sX.sub1
+    //    sY = s_or_b32 sLo, sHi
+    if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+        MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+      const MachineOperand &OrOpnd1 = Def->getOperand(1);
+      const MachineOperand &OrOpnd2 = Def->getOperand(2);
+      if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+        MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+        MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+        if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+            Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+            Def2->getOperand(1).isReg() &&
+            Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+            Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+            Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+          MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+          if (Select && foldableSelect(*Select))
+            optimizeSCC(Select, Def, RI);
+        }
+      }
+    }
     return true;
   };
 
@@ -10721,8 +10749,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
     // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
 
-    MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
-    if (!Def || Def->getParent() != CmpInstr.getParent())
+    MachineInstr *Def = MRI->getVRegDef(SrcReg);
+    if (!Def)
       return false;
 
     if (Def->getOpcode() != AMDGPU::S_AND_B32 &&

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 0643b532ea04c..8d693b1b19dcd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
 /// skipping copy like instructions and subreg-manipulation pseudos.
 /// Following another subreg of a reg:subreg isn't supported.
 MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                               MachineRegisterInfo &MRI);
+                               const MachineRegisterInfo &MRI);
 
 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
 /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 51df8c34cc55e..54b1554ae5d04 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7772,7 +7772,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
 ; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
@@ -7782,8 +7781,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX6-NEXT:    s_sub_u32 s12, 0, s10
-; GFX6-NEXT:    s_subb_u32 s13, 0, s11
+; GFX6-NEXT:    s_sub_u32 s0, 0, s10
+; GFX6-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -7792,128 +7791,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_mul_i32 s15, s13, s0
-; GFX6-NEXT:    s_mul_i32 s16, s12, s0
-; GFX6-NEXT:    s_add_i32 s1, s17, s1
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT:    s_add_i32 s1, s1, s15
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
-; GFX6-NEXT:    s_mul_i32 s17, s0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT:    s_add_u32 s15, s15, s17
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v0
-; GFX6-NEXT:    s_addc_u32 s17, 0, s17
-; GFX6-NEXT:    s_mul_i32 s16, s14, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s18, v4
-; GFX6-NEXT:    s_add_u32 s15, s15, s16
-; GFX6-NEXT:    s_addc_u32 s15, s17, s18
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
-; GFX6-NEXT:    s_addc_u32 s16, s16, 0
-; GFX6-NEXT:    s_mul_i32 s1, s14, s1
-; GFX6-NEXT:    s_add_u32 s1, s15, s1
-; GFX6-NEXT:    s_addc_u32 s15, 0, s16
-; GFX6-NEXT:    s_add_u32 s16, s0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s14, s14, s15
-; GFX6-NEXT:    s_mul_i32 s0, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX6-NEXT:    s_add_i32 s0, s1, s0
-; GFX6-NEXT:    s_mul_i32 s13, s13, s16
-; GFX6-NEXT:    s_mul_i32 s1, s12, s16
-; GFX6-NEXT:    s_add_i32 s0, s0, s13
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s0
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_add_u32 s13, s17, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
-; GFX6-NEXT:    s_mul_i32 s1, s14, s1
-; GFX6-NEXT:    s_addc_u32 s15, 0, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
-; GFX6-NEXT:    s_add_u32 s1, s13, s1
-; GFX6-NEXT:    s_addc_u32 s1, s15, s12
+; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    s_addc_u32 s12, s12, 0
-; GFX6-NEXT:    s_mul_i32 s0, s14, s0
-; GFX6-NEXT:    s_add_u32 s0, s1, s0
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    s_add_u32 s15, s16, s0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s14, s14, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s13, s0, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v2
+; GFX6-NEXT:    s_mul_i32 s14, s1, s2
+; GFX6-NEXT:    s_mul_i32 s15, s0, s2
+; GFX6-NEXT:    s_add_i32 s13, s16, s13
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s15
+; GFX6-NEXT:    s_add_i32 s13, s13, s14
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
+; GFX6-NEXT:    s_mul_i32 s16, s2, s13
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT:    s_add_u32 s14, s14, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX6-NEXT:    s_mul_i32 s15, s12, s15
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v4
+; GFX6-NEXT:    s_add_u32 s14, s14, s15
+; GFX6-NEXT:    s_addc_u32 s14, s16, s17
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v1
+; GFX6-NEXT:    s_addc_u32 s15, s15, 0
+; GFX6-NEXT:    s_mul_i32 s13, s12, s13
+; GFX6-NEXT:    s_add_u32 s13, s14, s13
+; GFX6-NEXT:    s_addc_u32 s14, 0, s15
+; GFX6-NEXT:    s_add_u32 s13, s2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    s_addc_u32 s12, s12, s14
+; GFX6-NEXT:    s_mul_i32 s14, s0, s12
+; GFX6-NEXT:    s_mul_i32 s1, s1, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX6-NEXT:    s_add_i32 s14, s15, s14
+; GFX6-NEXT:    s_mul_i32 s0, s0, s13
+; GFX6-NEXT:    s_add_i32 s1, s14, s1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT:    s_mul_i32 s15, s13, s1
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_add_u32 s15, s17, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX6-NEXT:    s_mul_i32 s0, s12, s0
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
+; GFX6-NEXT:    s_add_u32 s0, s15, s0
+; GFX6-NEXT:    s_addc_u32 s0, s16, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    s_addc_u32 s14, s14, 0
+; GFX6-NEXT:    s_mul_i32 s1, s12, s1
+; GFX6-NEXT:    s_add_u32 s0, s0, s1
+; GFX6-NEXT:    s_addc_u32 s1, 0, s14
+; GFX6-NEXT:    s_add_u32 s14, s13, s0
+; GFX6-NEXT:    s_addc_u32 s15, s12, s1
 ; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX6-NEXT:    s_add_u32 s0, s6, s12
 ; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    s_addc_u32 s1, s7, s12
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s15
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s15
+; GFX6-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v2
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT:    s_mul_i32 s1, s6, s14
+; GFX6-NEXT:    s_mul_i32 s1, s6, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX6-NEXT:    s_add_u32 s1, s16, s1
 ; GFX6-NEXT:    s_addc_u32 s4, 0, s4
-; GFX6-NEXT:    s_mul_i32 s15, s7, s15
+; GFX6-NEXT:    s_mul_i32 s14, s7, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
-; GFX6-NEXT:    s_add_u32 s1, s1, s15
+; GFX6-NEXT:    s_add_u32 s1, s1, s14
 ; GFX6-NEXT:    s_addc_u32 s1, s4, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX6-NEXT:    s_addc_u32 s4, s4, 0
-; GFX6-NEXT:    s_mul_i32 s14, s7, s14
-; GFX6-NEXT:    s_add_u32 s16, s1, s14
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    s_mul_i32 s14, s7, s15
+; GFX6-NEXT:    s_add_u32 s14, s1, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT:    s_addc_u32 s17, 0, s4
+; GFX6-NEXT:    s_addc_u32 s15, 0, s4
 ; GFX6-NEXT:    s_mov_b32 s1, s5
-; GFX6-NEXT:    s_mul_i32 s4, s10, s17
+; GFX6-NEXT:    s_mul_i32 s4, s10, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX6-NEXT:    s_add_i32 s4, s5, s4
-; GFX6-NEXT:    s_mul_i32 s5, s11, s16
-; GFX6-NEXT:    s_add_i32 s18, s4, s5
-; GFX6-NEXT:    s_sub_i32 s14, s7, s18
-; GFX6-NEXT:    s_mul_i32 s4, s10, s16
+; GFX6-NEXT:    s_mul_i32 s5, s11, s14
+; GFX6-NEXT:    s_add_i32 s16, s4, s5
+; GFX6-NEXT:    s_sub_i32 s17, s7, s16
+; GFX6-NEXT:    s_mul_i32 s4, s10, s14
 ; GFX6-NEXT:    s_sub_u32 s6, s6, s4
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT:    s_or_b32 s15, s4, s5
-; GFX6-NEXT:    s_subb_u32 s19, s14, s11
-; GFX6-NEXT:    s_sub_u32 s20, s6, s10
-; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s14, s19, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s14, s11
-; GFX6-NEXT:    s_cselect_b32 s15, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s20, s10
+; GFX6-NEXT:    s_subb_u32 s17, s17, s11
+; GFX6-NEXT:    s_sub_u32 s18, s6, s10
+; GFX6-NEXT:    s_subb_u32 s17, s17, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s17, s11
 ; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s14, s11
-; GFX6-NEXT:    s_cselect_b32 s14, s19, s15
-; GFX6-NEXT:    s_add_u32 s15, s16, 1
-; GFX6-NEXT:    s_addc_u32 s19, s17, 0
-; GFX6-NEXT:    s_add_u32 s20, s16, 2
-; GFX6-NEXT:    s_addc_u32 s21, s17, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX6-NEXT:    s_cselect_b32 s14, s20, s15
-; GFX6-NEXT:    s_cselect_b32 s15, s21, s19
+; GFX6-NEXT:    s_cmp_ge_u32 s18, s10
+; GFX6-NEXT:    s_cselect_b32 s18, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s17, s11
+; GFX6-NEXT:    s_cselect_b32 s17, s18, s19
+; GFX6-NEXT:    s_add_u32 s18, s14, 1
+; GFX6-NEXT:    s_addc_u32 s19, s15, 0
+; GFX6-NEXT:    s_add_u32 s20, s14, 2
+; GFX6-NEXT:    s_addc_u32 s21, s15, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b32 s17, s20, s18
+; GFX6-NEXT:    s_cselect_b32 s18, s21, s19
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
-; GFX6-NEXT:    s_subb_u32 s4, s7, s18
+; GFX6-NEXT:    s_subb_u32 s4, s7, s16
 ; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
 ; GFX6-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s6, s10
@@ -7921,13 +7913,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, s11
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, s5
 ; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s15, s17
-; GFX6-NEXT:    s_cselect_b32 s4, s14, s16
+; GFX6-NEXT:    s_cselect_b32 s5, s18, s15
+; GFX6-NEXT:    s_cselect_b32 s4, s17, s14
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[12:13], s[8:9]
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s6
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -8278,8 +8271,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_sub_u32 s14, 0, s6
-; GFX6-NEXT:    s_subb_u32 s15, 0, s7
+; GFX6-NEXT:    s_sub_u32 s12, 0, s6
+; GFX6-NEXT:    s_subb_u32 s13, 0, s7
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8288,69 +8281,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
-; GFX6-NEXT:    s_mul_i32 s13, s14, s16
+; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX6-NEXT:    s_mul_i32 s16, s12, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX6-NEXT:    s_mul_i32 s17, s15, s12
-; GFX6-NEXT:    s_mul_i32 s18, s14, s12
-; GFX6-NEXT:    s_add_i32 s13, s19, s13
+; GFX6-NEXT:    s_mul_i32 s17, s13, s15
+; GFX6-NEXT:    s_mul_i32 s18, s12, s15
+; GFX6-NEXT:    s_add_i32 s16, s19, s16
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s18
-; GFX6-NEXT:    s_add_i32 s13, s13, s17
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT:    s_add_i32 s16, s16, s17
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s16
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s18
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v3
-; GFX6-NEXT:    s_mul_i32 s20, s12, s13
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT:    s_mul_i32 s20, s15, s16
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s16
 ; GFX6-NEXT:    s_add_u32 s17, s17, s20
 ; GFX6-NEXT:    v_readfirstlane_b32 s20, v0
-; GFX6-NEXT:    s_mul_i32 s18, s16, s18
+; GFX6-NEXT:    s_mul_i32 s18, s14, s18
 ; GFX6-NEXT:    s_addc_u32 s20, 0, s20
 ; GFX6-NEXT:    v_readfirstlane_b32 s19, v4
 ; GFX6-NEXT:    s_add_u32 s17, s17, s18
 ; GFX6-NEXT:    s_addc_u32 s17, s20, s19
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v1
 ; GFX6-NEXT:    s_addc_u32 s18, s18, 0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s13
-; GFX6-NEXT:    s_add_u32 s13, s17, s13
+; GFX6-NEXT:    s_mul_i32 s16, s14, s16
+; GFX6-NEXT:    s_add_u32 s16, s17, s16
 ; GFX6-NEXT:    s_addc_u32 s17, 0, s18
-; GFX6-NEXT:    s_add_u32 s18, s12, s13
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_addc_u32 s16, s16, s17
-; GFX6-NEXT:    s_mul_i32 s12, s14, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX6-NEXT:    s_add_i32 s12, s13, s12
-; GFX6-NEXT:    s_mul_i32 s15, s15, s18
-; GFX6-NEXT:    s_mul_i32 s13, s14, s18
-; GFX6-NEXT:    s_add_i32 s12, s12, s15
-; GFX6-NEXT:    v_mov_b32_e32 v2, s13
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6-NEXT:    v_mul_hi_u32 v3, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s18, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s16, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s18, v0
-; GFX6-NEXT:    s_mul_i32 s15, s18, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX6-NEXT:    s_add_u32 s15, s19, s15
+; GFX6-NEXT:    s_add_u32 s15, s15, s16
+; GFX6-NEXT:    v_mov_b32_e32 v0, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_addc_u32 s14, s14, s17
+; GFX6-NEXT:    s_mul_i32 s16, s12, s14
+; GFX6-NEXT:    s_mul_i32 s13, s13, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s13
-; GFX6-NEXT:    s_addc_u32 s17, 0, s17
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
-; GFX6-NEXT:    s_add_u32 s13, s15, s13
-; GFX6-NEXT:    s_addc_u32 s13, s17, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    s_addc_u32 s14, s14, 0
-; GFX6-NEXT:    s_mul_i32 s12, s16, s12
-; GFX6-NEXT:    s_add_u32 s12, s13, s12
-; GFX6-NEXT:    s_addc_u32 s14, 0, s14
-; GFX6-NEXT:    s_add_u32 s15, s18, s12
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_addc_u32 s14, s16, s14
+; GFX6-NEXT:    s_add_i32 s16, s17, s16
+; GFX6-NEXT:    s_mul_i32 s12, s12, s15
+; GFX6-NEXT:    s_add_i32 s13, s16, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s15, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s14, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX6-NEXT:    s_mul_i32 s17, s15, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX6-NEXT:    s_add_u32 s17, s19, s17
+; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
+; GFX6-NEXT:    s_mul_i32 s12, s14, s12
+; GFX6-NEXT:    s_addc_u32 s18, 0, s18
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v3
+; GFX6-NEXT:    s_add_u32 s12, s17, s12
+; GFX6-NEXT:    s_addc_u32 s12, s18, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    s_addc_u32 s16, s16, 0
+; GFX6-NEXT:    s_mul_i32 s13, s14, s13
+; GFX6-NEXT:    s_add_u32 s12, s12, s13
+; GFX6-NEXT:    s_addc_u32 s13, 0, s16
+; GFX6-NEXT:    s_add_u32 s15, s15, s12
+; GFX6-NEXT:    s_addc_u32 s14, s14, s13
 ; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX6-NEXT:    s_add_u32 s8, s8, s12
 ; GFX6-NEXT:    s_mov_b32 s13, s12
@@ -8374,40 +8363,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
 ; GFX6-NEXT:    s_addc_u32 s16, s16, 0
 ; GFX6-NEXT:    s_mul_i32 s14, s9, s14
-; GFX6-NEXT:    s_add_u32 s18, s15, s14
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    s_add_u32 s17, s15, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s17
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT:    s_addc_u32 s19, 0, s16
-; GFX6-NEXT:    s_mul_i32 s14, s6, s19
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    s_mul_i32 s14, s6, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
 ; GFX6-NEXT:    s_add_i32 s14, s15, s14
-; GFX6-NEXT:    s_mul_i32 s15, s7, s18
-; GFX6-NEXT:    s_add_i32 s20, s14, s15
-; GFX6-NEXT:    s_sub_i32 s16, s9, s20
-; GFX6-NEXT:    s_mul_i32 s14, s6, s18
+; GFX6-NEXT:    s_mul_i32 s15, s7, s17
+; GFX6-NEXT:    s_add_i32 s18, s14, s15
+; GFX6-NEXT:    s_sub_i32 s19, s9, s18
+; GFX6-NEXT:    s_mul_i32 s14, s6, s17
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s14
 ; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s17, s14, s15
-; GFX6-NEXT:    s_subb_u32 s21, s16, s7
-; GFX6-NEXT:    s_sub_u32 s22, s8, s6
-; GFX6-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GFX6-NEXT:    s_or_b32 s16, s16, s17
-; GFX6-NEXT:    s_subb_u32 s16, s21, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s16, s7
-; GFX6-NEXT:    s_cselect_b32 s17, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s22, s6
+; GFX6-NEXT:    s_subb_u32 s19, s19, s7
+; GFX6-NEXT:    s_sub_u32 s20, s8, s6
+; GFX6-NEXT:    s_subb_u32 s19, s19, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s19, s7
 ; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s16, s7
-; GFX6-NEXT:    s_cselect_b32 s16, s21, s17
-; GFX6-NEXT:    s_add_u32 s17, s18, 1
-; GFX6-NEXT:    s_addc_u32 s21, s19, 0
-; GFX6-NEXT:    s_add_u32 s22, s18, 2
-; GFX6-NEXT:    s_addc_u32 s23, s19, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s22, s17
-; GFX6-NEXT:    s_cselect_b32 s17, s23, s21
+; GFX6-NEXT:    s_cmp_ge_u32 s20, s6
+; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s19, s7
+; GFX6-NEXT:    s_cselect_b32 s19, s20, s21
+; GFX6-NEXT:    s_add_u32 s20, s17, 1
+; GFX6-NEXT:    s_addc_u32 s21, s16, 0
+; GFX6-NEXT:    s_add_u32 s22, s17, 2
+; GFX6-NEXT:    s_addc_u32 s23, s16, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b32 s19, s22, s20
+; GFX6-NEXT:    s_cselect_b32 s20, s23, s21
 ; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s9, s9, s20
+; GFX6-NEXT:    s_subb_u32 s9, s9, s18
 ; GFX6-NEXT:    s_cmp_ge_u32 s9, s7
 ; GFX6-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s8, s6
@@ -8415,12 +8401,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s9, s7
 ; GFX6-NEXT:    s_cselect_b32 s6, s6, s14
 ; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s17, s19
-; GFX6-NEXT:    s_cselect_b32 s6, s16, s18
+; GFX6-NEXT:    s_cselect_b32 s7, s20, s16
+; GFX6-NEXT:    s_cselect_b32 s6, s19, s17
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[12:13], s[2:3]
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT:    s_sub_u32 s16, s6, s2
-; GFX6-NEXT:    s_subb_u32 s17, s7, s3
+; GFX6-NEXT:    s_sub_u32 s14, s6, s2
+; GFX6-NEXT:    s_subb_u32 s15, s7, s3
 ; GFX6-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX6-NEXT:    s_add_u32 s0, s0, s6
 ; GFX6-NEXT:    s_mov_b32 s7, s6
@@ -8428,8 +8414,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_u32 s12, 0, s8
-; GFX6-NEXT:    s_subb_u32 s13, 0, s9
+; GFX6-NEXT:    s_sub_u32 s2, 0, s8
+; GFX6-NEXT:    s_subb_u32 s3, 0, s9
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8438,128 +8424,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX6-NEXT:    s_mul_i32 s0, s13, s2
-; GFX6-NEXT:    s_add_i32 s1, s3, s1
-; GFX6-NEXT:    s_add_i32 s3, s1, s0
-; GFX6-NEXT:    s_mul_i32 s15, s12, s2
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s15
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mul_i32 s4, s2, s3
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s13, s2, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v2
+; GFX6-NEXT:    s_mul_i32 s1, s3, s0
+; GFX6-NEXT:    s_add_i32 s13, s16, s13
+; GFX6-NEXT:    s_add_i32 s13, s13, s1
+; GFX6-NEXT:    s_mul_i32 s1, s2, s0
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT:    s_mul_i32 s16, s0, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s15
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT:    s_add_u32 s4, s18, s4
-; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    s_mul_i32 s15, s14, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT:    s_add_u32 s16, s18, s16
+; GFX6-NEXT:    s_addc_u32 s17, 0, s17
+; GFX6-NEXT:    s_mul_i32 s1, s12, s1
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    s_add_u32 s4, s4, s15
-; GFX6-NEXT:    s_addc_u32 s4, s5, s18
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
-; GFX6-NEXT:    s_mul_i32 s3, s14, s3
-; GFX6-NEXT:    s_add_u32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s4, 0, s5
-; GFX6-NEXT:    s_add_u32 s5, s2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s5
-; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s4, s14, s4
-; GFX6-NEXT:    s_mul_i32 s2, s12, s4
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX6-NEXT:    s_add_i32 s2, s3, s2
-; GFX6-NEXT:    s_mul_i32 s13, s13, s5
-; GFX6-NEXT:    s_mul_i32 s3, s12, s5
-; GFX6-NEXT:    s_add_i32 s2, s2, s13
-; GFX6-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_add_u32 s1, s16, s1
+; GFX6-NEXT:    s_addc_u32 s1, s17, s18
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    s_addc_u32 s16, s16, 0
+; GFX6-NEXT:    s_mul_i32 s13, s12, s13
+; GFX6-NEXT:    s_add_u32 s1, s1, s13
+; GFX6-NEXT:    s_addc_u32 s13, 0, s16
+; GFX6-NEXT:    s_add_u32 s16, s0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_addc_u32 s4, s12, s13
+; GFX6-NEXT:    s_mul_i32 s5, s2, s4
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX6-NEXT:    s_add_i32 s5, s12, s5
+; GFX6-NEXT:    s_mul_i32 s3, s3, s16
+; GFX6-NEXT:    s_mul_i32 s2, s2, s16
+; GFX6-NEXT:    s_add_i32 s3, s5, s3
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    s_mul_i32 s13, s5, s2
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
-; GFX6-NEXT:    s_add_u32 s13, s15, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
-; GFX6-NEXT:    s_mul_i32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s14, 0, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
-; GFX6-NEXT:    s_add_u32 s3, s13, s3
-; GFX6-NEXT:    s_addc_u32 s3, s14, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s16, v0
+; GFX6-NEXT:    s_mul_i32 s12, s16, s3
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_add_u32 s12, s17, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
 ; GFX6-NEXT:    s_mul_i32 s2, s4, s2
-; GFX6-NEXT:    s_add_u32 s2, s3, s2
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    s_add_u32 s13, s5, s2
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s12, s4, s12
+; GFX6-NEXT:    s_addc_u32 s13, 0, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX6-NEXT:    s_add_u32 s2, s12, s2
+; GFX6-NEXT:    s_addc_u32 s2, s13, s5
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX6-NEXT:    s_addc_u32 s5, s5, 0
+; GFX6-NEXT:    s_mul_i32 s3, s4, s3
+; GFX6-NEXT:    s_add_u32 s2, s2, s3
+; GFX6-NEXT:    s_addc_u32 s3, 0, s5
+; GFX6-NEXT:    s_add_u32 s12, s16, s2
+; GFX6-NEXT:    s_addc_u32 s13, s4, s3
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_add_u32 s2, s10, s4
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_addc_u32 s3, s11, s4
 ; GFX6-NEXT:    s_xor_b64 s[10:11], s[2:3], s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s10, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
-; GFX6-NEXT:    s_mul_i32 s2, s10, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    s_mul_i32 s2, s10, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT:    s_add_u32 s2, s15, s2
-; GFX6-NEXT:    s_addc_u32 s14, 0, s14
-; GFX6-NEXT:    s_mul_i32 s13, s11, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v1
-; GFX6-NEXT:    s_add_u32 s2, s2, s13
-; GFX6-NEXT:    s_addc_u32 s2, s14, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_add_u32 s2, s17, s2
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
 ; GFX6-NEXT:    s_mul_i32 s12, s11, s12
-; GFX6-NEXT:    s_add_u32 s18, s2, s12
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v1
+; GFX6-NEXT:    s_add_u32 s2, s2, s12
+; GFX6-NEXT:    s_addc_u32 s2, s16, s17
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    s_mul_i32 s13, s11, s13
+; GFX6-NEXT:    s_add_u32 s16, s2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    s_addc_u32 s19, 0, s13
-; GFX6-NEXT:    s_mul_i32 s12, s8, s19
+; GFX6-NEXT:    s_addc_u32 s17, 0, s12
+; GFX6-NEXT:    s_mul_i32 s12, s8, s17
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
 ; GFX6-NEXT:    s_add_i32 s12, s13, s12
-; GFX6-NEXT:    s_mul_i32 s13, s9, s18
-; GFX6-NEXT:    s_add_i32 s20, s12, s13
-; GFX6-NEXT:    s_sub_i32 s14, s11, s20
-; GFX6-NEXT:    s_mul_i32 s12, s8, s18
+; GFX6-NEXT:    s_mul_i32 s13, s9, s16
+; GFX6-NEXT:    s_add_i32 s18, s12, s13
+; GFX6-NEXT:    s_sub_i32 s19, s11, s18
+; GFX6-NEXT:    s_mul_i32 s12, s8, s16
 ; GFX6-NEXT:    s_sub_u32 s10, s10, s12
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s15, s12, s13
-; GFX6-NEXT:    s_subb_u32 s21, s14, s9
-; GFX6-NEXT:    s_sub_u32 s22, s10, s8
-; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s14, s21, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s14, s9
-; GFX6-NEXT:    s_cselect_b32 s15, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s22, s8
+; GFX6-NEXT:    s_subb_u32 s19, s19, s9
+; GFX6-NEXT:    s_sub_u32 s20, s10, s8
+; GFX6-NEXT:    s_subb_u32 s19, s19, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s19, s9
 ; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s14, s9
-; GFX6-NEXT:    s_cselect_b32 s14, s21, s15
-; GFX6-NEXT:    s_add_u32 s15, s18, 1
-; GFX6-NEXT:    s_addc_u32 s21, s19, 0
-; GFX6-NEXT:    s_add_u32 s22, s18, 2
-; GFX6-NEXT:    s_addc_u32 s23, s19, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX6-NEXT:    s_cselect_b32 s14, s22, s15
-; GFX6-NEXT:    s_cselect_b32 s15, s23, s21
+; GFX6-NEXT:    s_cmp_ge_u32 s20, s8
+; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s19, s9
+; GFX6-NEXT:    s_cselect_b32 s19, s20, s21
+; GFX6-NEXT:    s_add_u32 s20, s16, 1
+; GFX6-NEXT:    s_addc_u32 s21, s17, 0
+; GFX6-NEXT:    s_add_u32 s22, s16, 2
+; GFX6-NEXT:    s_addc_u32 s23, s17, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b32 s19, s22, s20
+; GFX6-NEXT:    s_cselect_b32 s20, s23, s21
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s11, s11, s20
+; GFX6-NEXT:    s_subb_u32 s11, s11, s18
 ; GFX6-NEXT:    s_cmp_ge_u32 s11, s9
 ; GFX6-NEXT:    s_cselect_b32 s12, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s10, s8
@@ -8567,15 +8546,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s11, s9
 ; GFX6-NEXT:    s_cselect_b32 s8, s8, s12
 ; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s15, s19
-; GFX6-NEXT:    s_cselect_b32 s8, s14, s18
+; GFX6-NEXT:    s_cselect_b32 s9, s20, s17
+; GFX6-NEXT:    s_cselect_b32 s8, s19, s16
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[8:9], s[4:5]
 ; GFX6-NEXT:    s_sub_u32 s4, s6, s4
 ; GFX6-NEXT:    s_subb_u32 s5, s7, s5
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    v_mov_b32_e32 v1, s17
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9015,105 +8994,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_u32 s10, 0, s8
-; GFX6-NEXT:    s_subb_u32 s11, 0, s9
+; GFX6-NEXT:    s_sub_u32 s0, 0, s8
+; GFX6-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    s_mul_i32 s1, s10, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
-; GFX6-NEXT:    s_mul_i32 s13, s11, s0
-; GFX6-NEXT:    s_mul_i32 s14, s10, s0
-; GFX6-NEXT:    s_add_i32 s1, s15, s1
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GFX6-NEXT:    s_add_i32 s1, s1, s13
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v3
-; GFX6-NEXT:    s_mul_i32 s15, s0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT:    s_add_u32 s13, s13, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
-; GFX6-NEXT:    s_addc_u32 s15, 0, s15
-; GFX6-NEXT:    s_mul_i32 s14, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v4
-; GFX6-NEXT:    s_add_u32 s13, s13, s14
-; GFX6-NEXT:    s_addc_u32 s13, s15, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    s_addc_u32 s14, s14, 0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s1
-; GFX6-NEXT:    s_add_u32 s1, s13, s1
-; GFX6-NEXT:    s_addc_u32 s13, 0, s14
-; GFX6-NEXT:    s_add_u32 s14, s0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s14
-; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s12, s12, s13
-; GFX6-NEXT:    s_mul_i32 s0, s10, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX6-NEXT:    s_add_i32 s0, s1, s0
-; GFX6-NEXT:    s_mul_i32 s11, s11, s14
-; GFX6-NEXT:    s_mul_i32 s1, s10, s14
-; GFX6-NEXT:    s_add_i32 s0, s0, s11
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT:    s_mul_i32 s11, s14, s0
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
-; GFX6-NEXT:    s_add_u32 s11, s15, s11
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s1
-; GFX6-NEXT:    s_addc_u32 s13, 0, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s10, v3
-; GFX6-NEXT:    s_add_u32 s1, s11, s1
-; GFX6-NEXT:    s_addc_u32 s1, s13, s10
+; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v1
-; GFX6-NEXT:    s_addc_u32 s10, s10, 0
-; GFX6-NEXT:    s_mul_i32 s0, s12, s0
-; GFX6-NEXT:    s_add_u32 s0, s1, s0
-; GFX6-NEXT:    s_addc_u32 s10, 0, s10
-; GFX6-NEXT:    s_add_u32 s13, s14, s0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s12, s12, s10
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s11, s0, s10
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v2
+; GFX6-NEXT:    s_mul_i32 s12, s1, s2
+; GFX6-NEXT:    s_mul_i32 s13, s0, s2
+; GFX6-NEXT:    s_add_i32 s11, s14, s11
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GFX6-NEXT:    s_add_i32 s11, s11, s12
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
+; GFX6-NEXT:    s_mul_i32 s14, s2, s11
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GFX6-NEXT:    s_add_u32 s12, s12, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
+; GFX6-NEXT:    s_mul_i32 s13, s10, s13
+; GFX6-NEXT:    s_addc_u32 s14, 0, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v4
+; GFX6-NEXT:    s_add_u32 s12, s12, s13
+; GFX6-NEXT:    s_addc_u32 s12, s14, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_mul_i32 s11, s10, s11
+; GFX6-NEXT:    s_add_u32 s11, s12, s11
+; GFX6-NEXT:    s_addc_u32 s12, 0, s13
+; GFX6-NEXT:    s_add_u32 s11, s2, s11
+; GFX6-NEXT:    v_mov_b32_e32 v0, s11
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    s_addc_u32 s10, s10, s12
+; GFX6-NEXT:    s_mul_i32 s12, s0, s10
+; GFX6-NEXT:    s_mul_i32 s1, s1, s11
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX6-NEXT:    s_add_i32 s12, s13, s12
+; GFX6-NEXT:    s_mul_i32 s0, s0, s11
+; GFX6-NEXT:    s_add_i32 s1, s12, s1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GFX6-NEXT:    s_mul_i32 s13, s11, s1
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
+; GFX6-NEXT:    s_add_u32 s13, s15, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
+; GFX6-NEXT:    s_mul_i32 s0, s10, s0
+; GFX6-NEXT:    s_addc_u32 s14, 0, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
+; GFX6-NEXT:    s_add_u32 s0, s13, s0
+; GFX6-NEXT:    s_addc_u32 s0, s14, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    s_mul_i32 s1, s10, s1
+; GFX6-NEXT:    s_add_u32 s0, s0, s1
+; GFX6-NEXT:    s_addc_u32 s1, 0, s12
+; GFX6-NEXT:    s_add_u32 s12, s11, s0
+; GFX6-NEXT:    s_addc_u32 s13, s10, s1
 ; GFX6-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX6-NEXT:    s_add_u32 s0, s6, s10
 ; GFX6-NEXT:    s_mov_b32 s11, s10
 ; GFX6-NEXT:    s_addc_u32 s1, s7, s10
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v2
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT:    s_mul_i32 s1, s6, s12
+; GFX6-NEXT:    s_mul_i32 s1, s6, s13
 ; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX6-NEXT:    s_add_u32 s1, s14, s1
 ; GFX6-NEXT:    s_addc_u32 s4, 0, s4
-; GFX6-NEXT:    s_mul_i32 s13, s7, s13
+; GFX6-NEXT:    s_mul_i32 s12, s7, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    s_add_u32 s1, s1, s13
+; GFX6-NEXT:    s_add_u32 s1, s1, s12
 ; GFX6-NEXT:    s_addc_u32 s1, s4, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX6-NEXT:    s_addc_u32 s4, s4, 0
-; GFX6-NEXT:    s_mul_i32 s12, s7, s12
+; GFX6-NEXT:    s_mul_i32 s12, s7, s13
 ; GFX6-NEXT:    s_add_u32 s12, s1, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
@@ -9128,11 +9102,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_mul_i32 s4, s8, s12
 ; GFX6-NEXT:    s_sub_u32 s6, s6, s4
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s4, s5
 ; GFX6-NEXT:    s_subb_u32 s15, s13, s9
 ; GFX6-NEXT:    s_sub_u32 s16, s6, s8
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s17, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s17, s15, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s17, s9
 ; GFX6-NEXT:    s_cselect_b32 s18, -1, 0
@@ -9141,13 +9113,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_cmp_eq_u32 s17, s9
 ; GFX6-NEXT:    s_cselect_b32 s18, s19, s18
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s15, s15, s9
-; GFX6-NEXT:    s_sub_u32 s19, s16, s8
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s12, s15, 0
+; GFX6-NEXT:    s_subb_u32 s12, s15, s9
+; GFX6-NEXT:    s_sub_u32 s13, s16, s8
+; GFX6-NEXT:    s_subb_u32 s12, s12, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX6-NEXT:    s_cselect_b32 s13, s19, s16
+; GFX6-NEXT:    s_cselect_b32 s13, s13, s16
 ; GFX6-NEXT:    s_cselect_b32 s12, s12, s17
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
 ; GFX6-NEXT:    s_subb_u32 s4, s7, s14
@@ -9164,6 +9134,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s10
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -9405,8 +9376,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT:    s_sub_u32 s12, 0, s2
-; GFX6-NEXT:    s_subb_u32 s13, 0, s3
+; GFX6-NEXT:    s_sub_u32 s6, 0, s2
+; GFX6-NEXT:    s_subb_u32 s7, 0, s3
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9415,69 +9386,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX6-NEXT:    s_mul_i32 s7, s12, s14
+; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX6-NEXT:    s_mul_i32 s14, s6, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_mul_i32 s15, s13, s6
-; GFX6-NEXT:    s_mul_i32 s16, s12, s6
-; GFX6-NEXT:    s_add_i32 s7, s17, s7
+; GFX6-NEXT:    s_mul_i32 s15, s7, s13
+; GFX6-NEXT:    s_mul_i32 s16, s6, s13
+; GFX6-NEXT:    s_add_i32 s14, s17, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT:    s_add_i32 s7, s7, s15
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s7
+; GFX6-NEXT:    s_add_i32 s14, s14, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
-; GFX6-NEXT:    s_mul_i32 s18, s6, s7
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s7
+; GFX6-NEXT:    s_mul_i32 s18, s13, s14
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s14
 ; GFX6-NEXT:    s_add_u32 s15, s15, s18
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    s_mul_i32 s16, s14, s16
+; GFX6-NEXT:    s_mul_i32 s16, s12, s16
 ; GFX6-NEXT:    s_addc_u32 s18, 0, s18
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v4
 ; GFX6-NEXT:    s_add_u32 s15, s15, s16
 ; GFX6-NEXT:    s_addc_u32 s15, s18, s17
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
 ; GFX6-NEXT:    s_addc_u32 s16, s16, 0
-; GFX6-NEXT:    s_mul_i32 s7, s14, s7
-; GFX6-NEXT:    s_add_u32 s7, s15, s7
+; GFX6-NEXT:    s_mul_i32 s14, s12, s14
+; GFX6-NEXT:    s_add_u32 s14, s15, s14
 ; GFX6-NEXT:    s_addc_u32 s15, 0, s16
-; GFX6-NEXT:    s_add_u32 s16, s6, s7
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT:    s_or_b32 s6, s6, s7
-; GFX6-NEXT:    s_addc_u32 s14, s14, s15
-; GFX6-NEXT:    s_mul_i32 s6, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX6-NEXT:    s_add_i32 s6, s7, s6
-; GFX6-NEXT:    s_mul_i32 s13, s13, s16
-; GFX6-NEXT:    s_mul_i32 s7, s12, s16
-; GFX6-NEXT:    s_add_i32 s6, s6, s13
-; GFX6-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s6
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_add_u32 s13, s17, s13
+; GFX6-NEXT:    s_add_u32 s13, s13, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    s_addc_u32 s12, s12, s15
+; GFX6-NEXT:    s_mul_i32 s14, s6, s12
+; GFX6-NEXT:    s_mul_i32 s7, s7, s13
 ; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
-; GFX6-NEXT:    s_mul_i32 s7, s14, s7
-; GFX6-NEXT:    s_addc_u32 s15, 0, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
-; GFX6-NEXT:    s_add_u32 s7, s13, s7
-; GFX6-NEXT:    s_addc_u32 s7, s15, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    s_addc_u32 s12, s12, 0
-; GFX6-NEXT:    s_mul_i32 s6, s14, s6
-; GFX6-NEXT:    s_add_u32 s6, s7, s6
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    s_add_u32 s13, s16, s6
-; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT:    s_or_b32 s6, s6, s7
-; GFX6-NEXT:    s_addc_u32 s12, s14, s12
+; GFX6-NEXT:    s_add_i32 s14, s15, s14
+; GFX6-NEXT:    s_mul_i32 s6, s6, s13
+; GFX6-NEXT:    s_add_i32 s7, s14, s7
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v0, s7
+; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT:    s_mul_i32 s15, s13, s7
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_add_u32 s15, s17, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX6-NEXT:    s_mul_i32 s6, s12, s6
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
+; GFX6-NEXT:    s_add_u32 s6, s15, s6
+; GFX6-NEXT:    s_addc_u32 s6, s16, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    s_addc_u32 s14, s14, 0
+; GFX6-NEXT:    s_mul_i32 s7, s12, s7
+; GFX6-NEXT:    s_add_u32 s6, s6, s7
+; GFX6-NEXT:    s_addc_u32 s7, 0, s14
+; GFX6-NEXT:    s_add_u32 s13, s13, s6
+; GFX6-NEXT:    s_addc_u32 s12, s12, s7
 ; GFX6-NEXT:    s_ashr_i32 s6, s9, 31
 ; GFX6-NEXT:    s_add_u32 s8, s8, s6
 ; GFX6-NEXT:    s_mov_b32 s7, s6
@@ -9514,11 +9481,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_mul_i32 s12, s2, s12
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s12
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s15, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s17, s14, s3
 ; GFX6-NEXT:    s_sub_u32 s18, s8, s2
 ; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s19, s14, s15
 ; GFX6-NEXT:    s_subb_u32 s19, s17, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s19, s3
 ; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
@@ -9527,13 +9492,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s19, s3
 ; GFX6-NEXT:    s_cselect_b32 s20, s21, s20
 ; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s17, s17, s3
-; GFX6-NEXT:    s_sub_u32 s21, s18, s2
-; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s14, s17, 0
+; GFX6-NEXT:    s_subb_u32 s14, s17, s3
+; GFX6-NEXT:    s_sub_u32 s15, s18, s2
+; GFX6-NEXT:    s_subb_u32 s14, s14, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b32 s15, s21, s18
+; GFX6-NEXT:    s_cselect_b32 s15, s15, s18
 ; GFX6-NEXT:    s_cselect_b32 s14, s14, s19
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s9, s9, s16
@@ -9556,8 +9519,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_sub_u32 s8, 0, s6
-; GFX6-NEXT:    s_subb_u32 s9, 0, s7
+; GFX6-NEXT:    s_sub_u32 s2, 0, s6
+; GFX6-NEXT:    s_subb_u32 s3, 0, s7
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9566,70 +9529,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s1, s8, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX6-NEXT:    s_mul_i32 s0, s9, s2
-; GFX6-NEXT:    s_add_i32 s1, s3, s1
-; GFX6-NEXT:    s_add_i32 s3, s1, s0
-; GFX6-NEXT:    s_mul_i32 s13, s8, s2
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s13
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mul_i32 s4, s2, s3
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s9, s2, s8
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v2
+; GFX6-NEXT:    s_mul_i32 s1, s3, s0
+; GFX6-NEXT:    s_add_i32 s9, s12, s9
+; GFX6-NEXT:    s_add_i32 s9, s9, s1
+; GFX6-NEXT:    s_mul_i32 s1, s2, s0
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s9
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT:    s_mul_i32 s12, s0, s9
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v2
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s13
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT:    s_add_u32 s4, s16, s4
-; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    s_mul_i32 s13, s12, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s9
+; GFX6-NEXT:    s_add_u32 s12, s16, s12
+; GFX6-NEXT:    s_addc_u32 s13, 0, s13
+; GFX6-NEXT:    s_mul_i32 s1, s8, s1
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX6-NEXT:    s_add_u32 s4, s4, s13
-; GFX6-NEXT:    s_addc_u32 s4, s5, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
-; GFX6-NEXT:    s_mul_i32 s3, s12, s3
-; GFX6-NEXT:    s_add_u32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s4, 0, s5
-; GFX6-NEXT:    s_add_u32 s5, s2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s5
-; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s4, s12, s4
-; GFX6-NEXT:    s_mul_i32 s2, s8, s4
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX6-NEXT:    s_add_i32 s2, s3, s2
-; GFX6-NEXT:    s_mul_i32 s9, s9, s5
-; GFX6-NEXT:    s_mul_i32 s3, s8, s5
-; GFX6-NEXT:    s_add_i32 s2, s2, s9
-; GFX6-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_add_u32 s1, s12, s1
+; GFX6-NEXT:    s_addc_u32 s1, s13, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    s_mul_i32 s9, s8, s9
+; GFX6-NEXT:    s_add_u32 s1, s1, s9
+; GFX6-NEXT:    s_addc_u32 s9, 0, s12
+; GFX6-NEXT:    s_add_u32 s12, s0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_addc_u32 s4, s8, s9
+; GFX6-NEXT:    s_mul_i32 s5, s2, s4
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX6-NEXT:    s_add_i32 s5, s8, s5
+; GFX6-NEXT:    s_mul_i32 s3, s3, s12
+; GFX6-NEXT:    s_mul_i32 s2, s2, s12
+; GFX6-NEXT:    s_add_i32 s3, s5, s3
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    s_mul_i32 s9, s5, s2
+; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_mul_i32 s8, s12, s3
 ; GFX6-NEXT:    v_readfirstlane_b32 s13, v2
-; GFX6-NEXT:    s_add_u32 s9, s13, s9
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
-; GFX6-NEXT:    s_mul_i32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX6-NEXT:    s_add_u32 s3, s9, s3
-; GFX6-NEXT:    s_addc_u32 s3, s12, s8
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX6-NEXT:    s_addc_u32 s8, s8, 0
+; GFX6-NEXT:    s_add_u32 s8, s13, s8
+; GFX6-NEXT:    v_readfirstlane_b32 s9, v0
 ; GFX6-NEXT:    s_mul_i32 s2, s4, s2
-; GFX6-NEXT:    s_add_u32 s2, s3, s2
-; GFX6-NEXT:    s_addc_u32 s8, 0, s8
-; GFX6-NEXT:    s_add_u32 s12, s5, s2
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s13, s4, s8
+; GFX6-NEXT:    s_addc_u32 s9, 0, s9
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX6-NEXT:    s_add_u32 s2, s8, s2
+; GFX6-NEXT:    s_addc_u32 s2, s9, s5
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX6-NEXT:    s_addc_u32 s5, s5, 0
+; GFX6-NEXT:    s_mul_i32 s3, s4, s3
+; GFX6-NEXT:    s_add_u32 s2, s2, s3
+; GFX6-NEXT:    s_addc_u32 s3, 0, s5
+; GFX6-NEXT:    s_add_u32 s12, s12, s2
+; GFX6-NEXT:    s_addc_u32 s13, s4, s3
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_add_u32 s2, s10, s4
 ; GFX6-NEXT:    s_mov_b32 s5, s4
@@ -9667,11 +9626,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_mul_i32 s10, s6, s11
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s10
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT:    s_or_b32 s13, s10, s11
 ; GFX6-NEXT:    s_subb_u32 s17, s12, s7
 ; GFX6-NEXT:    s_sub_u32 s18, s8, s6
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s19, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s19, s17, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s19, s7
 ; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
@@ -9680,13 +9637,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s19, s7
 ; GFX6-NEXT:    s_cselect_b32 s20, s21, s20
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s17, s17, s7
-; GFX6-NEXT:    s_sub_u32 s21, s18, s6
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s12, s17, 0
+; GFX6-NEXT:    s_subb_u32 s12, s17, s7
+; GFX6-NEXT:    s_sub_u32 s13, s18, s6
+; GFX6-NEXT:    s_subb_u32 s12, s12, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b32 s13, s21, s18
+; GFX6-NEXT:    s_cselect_b32 s13, s13, s18
 ; GFX6-NEXT:    s_cselect_b32 s12, s12, s19
 ; GFX6-NEXT:    s_or_b32 s10, s10, s11
 ; GFX6-NEXT:    s_subb_u32 s9, s9, s16

diff  --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de173dc8c6..8d05317162e9c 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_mov_b32 s10, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_add_u32 s4, s4, s6
-; CISI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT:    s_or_b32 s6, s12, s13
 ; CISI-NEXT:    s_addc_u32 s5, s5, s7
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
@@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_mov_b32 s10, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_sub_u32 s4, s4, s6
-; CISI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT:    s_or_b32 s6, s12, s13
 ; CISI-NEXT:    s_subb_u32 s5, s5, s7
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index dbdea8e3c533d..71af21a11c2ce 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() {
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_add_u32 s7, s6, s6
-; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_addc_u32 s8, s6, 0
 ; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -88,15 +86,13 @@ bb:
 define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
-; GFX7-NEXT:    s_load_dword s2, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_add_i32 s12, s12, s17
 ; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s2, s2
-; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_addc_u32 s0, s2, 0
+; GFX7-NEXT:    s_add_u32 s1, s0, s0
+; GFX7-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2

diff  --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index fba42c494343b..fa452f3717f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -2277,3 +2277,181 @@ body:             |
     S_ENDPGM 0
 
 ...
+
+---
+name:            s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000
+body:             |
+  ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+  ; GCN-NEXT:   S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+  ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    %0:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_32 = COPY %0
+    S_CMP_LG_U32 %2, 0, implicit-def $scc
+    %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec
+    %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+    %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc
+    S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+# Do not delete s_or_b32 because of intervening def of scc
+name:            s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening
+body:             |
+  ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+  ; GCN-NEXT:   S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+  ; GCN-NEXT:   S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+  ; GCN-NEXT:   %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc
+  ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    %0:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_32 = COPY %0
+    S_CMP_LG_U32 %2, 0, implicit-def $scc
+    %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    S_CMP_LG_U32 %2, 0, implicit-def $scc
+    %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec
+    %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+    %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc
+    S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+# Do not delete s_or_b32 since both operands are sub1.
+name:            s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize
+body:             |
+  ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+  ; GCN-NEXT:   S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_CSELECT_B64_]].sub1
+  ; GCN-NEXT:   %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc
+  ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    %0:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_32 = COPY %0
+    S_CMP_LG_U32 %2, 0, implicit-def $scc
+    %31:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+    %40:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+    %41:sreg_32 = COPY %31.sub1:sreg_64_xexec
+    %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32, implicit-def $scc
+    S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+
+---
+name:            s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000
+body:             |
+  ; GCN-LABEL: name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000
+  ; GCN: bb.0:
+  ; GCN-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GCN-NEXT:   liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; GCN-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+  ; GCN-NEXT:   S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+  ; GCN-NEXT:   %sgpr4:sreg_32 = S_OR_B32 undef %4:sreg_32_xm0_xexec, undef %5:sreg_32_xm0_xexec, implicit-def $scc
+  ; GCN-NEXT:   S_CBRANCH_SCC0 %bb.2, implicit $scc
+  ; GCN-NEXT:   S_BRANCH %bb.1
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.1:
+  ; GCN-NEXT:   successors: %bb.2(0x80000000)
+  ; GCN-NEXT: {{  $}}
+  ; GCN-NEXT: bb.2:
+  ; GCN-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+    %0:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_32 = COPY %0
+    S_CMP_LG_U32 %2, 0, implicit-def $scc
+    %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+    %sgpr4:sreg_32 = S_OR_B32 undef %40:sreg_32_xm0_xexec, undef %41:sreg_32_xm0_xexec, implicit-def $scc
+    S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+  bb.2:
+    S_ENDPGM 0
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 71f5a94a7f245..74a6d7fe39362 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s8, s1, 31
 ; GCN-NEXT:    s_add_u32 s0, s0, s8
@@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GCN-NEXT:    s_sub_u32 s12, 0, s10
-; GCN-NEXT:    s_subb_u32 s13, 0, s11
+; GCN-NEXT:    s_sub_u32 s0, 0, s10
+; GCN-NEXT:    s_subb_u32 s1, 0, s11
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s1, s12, s14
-; GCN-NEXT:    v_readfirstlane_b32 s17, v2
-; GCN-NEXT:    s_mul_i32 s15, s13, s0
-; GCN-NEXT:    s_mul_i32 s16, s12, s0
-; GCN-NEXT:    s_add_i32 s1, s17, s1
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s16
-; GCN-NEXT:    s_add_i32 s1, s1, s15
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s16
-; GCN-NEXT:    v_readfirstlane_b32 s15, v3
-; GCN-NEXT:    s_mul_i32 s17, s0, s1
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT:    s_add_u32 s15, s15, s17
-; GCN-NEXT:    v_readfirstlane_b32 s17, v0
-; GCN-NEXT:    s_addc_u32 s17, 0, s17
-; GCN-NEXT:    s_mul_i32 s16, s14, s16
-; GCN-NEXT:    v_readfirstlane_b32 s18, v4
-; GCN-NEXT:    s_add_u32 s15, s15, s16
-; GCN-NEXT:    s_addc_u32 s15, s17, s18
-; GCN-NEXT:    v_readfirstlane_b32 s16, v1
-; GCN-NEXT:    s_addc_u32 s16, s16, 0
-; GCN-NEXT:    s_mul_i32 s1, s14, s1
-; GCN-NEXT:    s_add_u32 s1, s15, s1
-; GCN-NEXT:    s_addc_u32 s15, 0, s16
-; GCN-NEXT:    s_add_u32 s16, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s14, s14, s15
-; GCN-NEXT:    s_mul_i32 s0, s12, s14
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s0, s1, s0
-; GCN-NEXT:    s_mul_i32 s13, s13, s16
-; GCN-NEXT:    s_mul_i32 s1, s12, s16
-; GCN-NEXT:    s_add_i32 s0, s0, s13
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mul_hi_u32 v3, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s16, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s14, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s16, v0
-; GCN-NEXT:    s_mul_i32 s13, s16, s0
-; GCN-NEXT:    v_readfirstlane_b32 s17, v2
-; GCN-NEXT:    s_add_u32 s13, s17, s13
-; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s1, s14, s1
-; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s12, v3
-; GCN-NEXT:    s_add_u32 s1, s13, s1
-; GCN-NEXT:    s_addc_u32 s1, s15, s12
+; GCN-NEXT:    v_mul_hi_u32 v2, s0, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    s_addc_u32 s12, s12, 0
-; GCN-NEXT:    s_mul_i32 s0, s14, s0
-; GCN-NEXT:    s_add_u32 s0, s1, s0
-; GCN-NEXT:    s_addc_u32 s12, 0, s12
-; GCN-NEXT:    s_add_u32 s15, s16, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s14, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s13, s0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s16, v2
+; GCN-NEXT:    s_mul_i32 s14, s1, s2
+; GCN-NEXT:    s_mul_i32 s15, s0, s2
+; GCN-NEXT:    s_add_i32 s13, s16, s13
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s15
+; GCN-NEXT:    s_add_i32 s13, s13, s14
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s13
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s15
+; GCN-NEXT:    v_readfirstlane_b32 s14, v3
+; GCN-NEXT:    s_mul_i32 s16, s2, s13
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GCN-NEXT:    s_add_u32 s14, s14, s16
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    s_mul_i32 s15, s12, s15
+; GCN-NEXT:    s_addc_u32 s16, 0, s16
+; GCN-NEXT:    v_readfirstlane_b32 s17, v4
+; GCN-NEXT:    s_add_u32 s14, s14, s15
+; GCN-NEXT:    s_addc_u32 s14, s16, s17
+; GCN-NEXT:    v_readfirstlane_b32 s15, v1
+; GCN-NEXT:    s_addc_u32 s15, s15, 0
+; GCN-NEXT:    s_mul_i32 s13, s12, s13
+; GCN-NEXT:    s_add_u32 s13, s14, s13
+; GCN-NEXT:    s_addc_u32 s14, 0, s15
+; GCN-NEXT:    s_add_u32 s13, s2, s13
+; GCN-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    s_addc_u32 s12, s12, s14
+; GCN-NEXT:    s_mul_i32 s14, s0, s12
+; GCN-NEXT:    s_mul_i32 s1, s1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s15, v0
+; GCN-NEXT:    s_add_i32 s14, s15, s14
+; GCN-NEXT:    s_mul_i32 s0, s0, s13
+; GCN-NEXT:    s_add_i32 s1, s14, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GCN-NEXT:    s_mul_i32 s15, s13, s1
+; GCN-NEXT:    v_readfirstlane_b32 s17, v2
+; GCN-NEXT:    s_add_u32 s15, s17, s15
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    s_mul_i32 s0, s12, s0
+; GCN-NEXT:    s_addc_u32 s16, 0, s16
+; GCN-NEXT:    v_readfirstlane_b32 s14, v3
+; GCN-NEXT:    s_add_u32 s0, s15, s0
+; GCN-NEXT:    s_addc_u32 s0, s16, s14
+; GCN-NEXT:    v_readfirstlane_b32 s14, v1
+; GCN-NEXT:    s_addc_u32 s14, s14, 0
+; GCN-NEXT:    s_mul_i32 s1, s12, s1
+; GCN-NEXT:    s_add_u32 s0, s0, s1
+; GCN-NEXT:    s_addc_u32 s1, 0, s14
+; GCN-NEXT:    s_add_u32 s14, s13, s0
+; GCN-NEXT:    s_addc_u32 s15, s12, s1
 ; GCN-NEXT:    s_ashr_i32 s12, s7, 31
 ; GCN-NEXT:    s_add_u32 s0, s6, s12
 ; GCN-NEXT:    s_mov_b32 s13, s12
 ; GCN-NEXT:    s_addc_u32 s1, s7, s12
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
-; GCN-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_mul_hi_u32 v3, s6, v2
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s7, v2
-; GCN-NEXT:    s_mul_i32 s1, s6, s14
+; GCN-NEXT:    s_mul_i32 s1, s6, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v3
 ; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GCN-NEXT:    s_add_u32 s1, s16, s1
 ; GCN-NEXT:    s_addc_u32 s4, 0, s4
-; GCN-NEXT:    s_mul_i32 s15, s7, s15
+; GCN-NEXT:    s_mul_i32 s14, s7, s14
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v1
-; GCN-NEXT:    s_add_u32 s1, s1, s15
+; GCN-NEXT:    s_add_u32 s1, s1, s14
 ; GCN-NEXT:    s_addc_u32 s1, s4, s16
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    s_addc_u32 s4, s4, 0
-; GCN-NEXT:    s_mul_i32 s14, s7, s14
-; GCN-NEXT:    s_add_u32 s16, s1, s14
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    s_mul_i32 s14, s7, s15
+; GCN-NEXT:    s_add_u32 s14, s1, s14
+; GCN-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_addc_u32 s17, 0, s4
+; GCN-NEXT:    s_addc_u32 s15, 0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_mul_i32 s4, s10, s17
+; GCN-NEXT:    s_mul_i32 s4, s10, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s5, v0
 ; GCN-NEXT:    s_add_i32 s4, s5, s4
-; GCN-NEXT:    s_mul_i32 s5, s11, s16
-; GCN-NEXT:    s_add_i32 s18, s4, s5
-; GCN-NEXT:    s_sub_i32 s14, s7, s18
-; GCN-NEXT:    s_mul_i32 s4, s10, s16
+; GCN-NEXT:    s_mul_i32 s5, s11, s14
+; GCN-NEXT:    s_add_i32 s16, s4, s5
+; GCN-NEXT:    s_sub_i32 s17, s7, s16
+; GCN-NEXT:    s_mul_i32 s4, s10, s14
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s15, s4, s5
-; GCN-NEXT:    s_subb_u32 s19, s14, s11
-; GCN-NEXT:    s_sub_u32 s20, s6, s10
-; GCN-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s14, s15
-; GCN-NEXT:    s_subb_u32 s14, s19, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s11
-; GCN-NEXT:    s_cselect_b32 s15, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s20, s10
+; GCN-NEXT:    s_subb_u32 s17, s17, s11
+; GCN-NEXT:    s_sub_u32 s18, s6, s10
+; GCN-NEXT:    s_subb_u32 s17, s17, 0
+; GCN-NEXT:    s_cmp_ge_u32 s17, s11
 ; GCN-NEXT:    s_cselect_b32 s19, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s14, s11
-; GCN-NEXT:    s_cselect_b32 s14, s19, s15
-; GCN-NEXT:    s_add_u32 s15, s16, 1
-; GCN-NEXT:    s_addc_u32 s19, s17, 0
-; GCN-NEXT:    s_add_u32 s20, s16, 2
-; GCN-NEXT:    s_addc_u32 s21, s17, 0
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_cselect_b32 s14, s20, s15
-; GCN-NEXT:    s_cselect_b32 s15, s21, s19
+; GCN-NEXT:    s_cmp_ge_u32 s18, s10
+; GCN-NEXT:    s_cselect_b32 s18, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s17, s11
+; GCN-NEXT:    s_cselect_b32 s17, s18, s19
+; GCN-NEXT:    s_add_u32 s18, s14, 1
+; GCN-NEXT:    s_addc_u32 s19, s15, 0
+; GCN-NEXT:    s_add_u32 s20, s14, 2
+; GCN-NEXT:    s_addc_u32 s21, s15, 0
+; GCN-NEXT:    s_cmp_lg_u32 s17, 0
+; GCN-NEXT:    s_cselect_b32 s17, s20, s18
+; GCN-NEXT:    s_cselect_b32 s18, s21, s19
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_subb_u32 s4, s7, s18
+; GCN-NEXT:    s_subb_u32 s4, s7, s16
 ; GCN-NEXT:    s_cmp_ge_u32 s4, s11
 ; GCN-NEXT:    s_cselect_b32 s5, -1, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s6, s10
@@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_cmp_eq_u32 s4, s11
 ; GCN-NEXT:    s_cselect_b32 s4, s6, s5
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s5, s15, s17
-; GCN-NEXT:    s_cselect_b32 s4, s14, s16
+; GCN-NEXT:    s_cselect_b32 s5, s18, s15
+; GCN-NEXT:    s_cselect_b32 s4, s17, s14
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[12:13], s[8:9]
 ; GCN-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-NEXT:    s_sub_u32 s4, s4, s6
 ; GCN-NEXT:    s_subb_u32 s5, s5, s7
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s10, s10, s11
 ; GCN-IR-NEXT:    s_addc_u32 s10, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
@@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_sub_u32 s16, s16, s20
 ; GCN-IR-NEXT:    s_subb_u32 s17, s17, s21
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s20, s20, s21
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[8:9]
@@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GCN-NEXT:    s_sub_u32 s2, 0, s6
-; GCN-NEXT:    s_subb_u32 s10, 0, s7
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_subb_u32 s8, 0, s7
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s11, v1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    s_mul_i32 s9, s2, s11
-; GCN-NEXT:    v_readfirstlane_b32 s14, v2
-; GCN-NEXT:    s_mul_i32 s12, s10, s8
-; GCN-NEXT:    s_mul_i32 s13, s2, s8
-; GCN-NEXT:    s_add_i32 s9, s14, s9
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
-; GCN-NEXT:    s_add_i32 s9, s9, s12
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v3
-; GCN-NEXT:    s_mul_i32 s15, s8, s9
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s9
-; GCN-NEXT:    s_add_u32 s12, s12, s15
-; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s13, s11, s13
-; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s14, v4
-; GCN-NEXT:    s_add_u32 s12, s12, s13
-; GCN-NEXT:    s_addc_u32 s12, s15, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v1
-; GCN-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NEXT:    s_mul_i32 s9, s11, s9
-; GCN-NEXT:    s_add_u32 s9, s12, s9
-; GCN-NEXT:    s_addc_u32 s12, 0, s13
-; GCN-NEXT:    s_add_u32 s13, s8, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    v_readfirstlane_b32 s3, v0
+; GCN-NEXT:    s_mul_i32 s10, s2, s9
+; GCN-NEXT:    v_readfirstlane_b32 s13, v2
+; GCN-NEXT:    s_mul_i32 s11, s8, s3
+; GCN-NEXT:    s_mul_i32 s12, s2, s3
+; GCN-NEXT:    s_add_i32 s10, s13, s10
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s12
+; GCN-NEXT:    s_add_i32 s10, s10, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s10
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v3
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s10
+; GCN-NEXT:    s_mul_i32 s14, s3, s10
+; GCN-NEXT:    s_add_u32 s11, s11, s14
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s12, s9, s12
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s13, v4
+; GCN-NEXT:    s_add_u32 s11, s11, s12
+; GCN-NEXT:    v_readfirstlane_b32 s15, v1
+; GCN-NEXT:    s_addc_u32 s11, s14, s13
+; GCN-NEXT:    s_addc_u32 s12, s15, 0
+; GCN-NEXT:    s_mul_i32 s10, s9, s10
+; GCN-NEXT:    s_add_u32 s10, s11, s10
+; GCN-NEXT:    s_addc_u32 s11, 0, s12
+; GCN-NEXT:    s_add_u32 s10, s3, s10
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s11, s11, s12
-; GCN-NEXT:    s_mul_i32 s8, s2, s11
-; GCN-NEXT:    v_readfirstlane_b32 s9, v0
-; GCN-NEXT:    s_add_i32 s8, s9, s8
-; GCN-NEXT:    s_mul_i32 s10, s10, s13
-; GCN-NEXT:    s_mul_i32 s2, s2, s13
-; GCN-NEXT:    s_add_i32 s8, s8, s10
+; GCN-NEXT:    s_addc_u32 s9, s9, s11
+; GCN-NEXT:    s_mul_i32 s11, s2, s9
+; GCN-NEXT:    s_mul_i32 s8, s8, s10
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_add_i32 s11, s12, s11
+; GCN-NEXT:    s_mul_i32 s2, s2, s10
+; GCN-NEXT:    s_add_i32 s8, s11, s8
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mul_hi_u32 v3, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s13, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GCN-NEXT:    s_mul_i32 s10, s13, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT:    s_mul_i32 s12, s10, s8
 ; GCN-NEXT:    v_readfirstlane_b32 s14, v2
-; GCN-NEXT:    s_add_u32 s10, s14, s10
-; GCN-NEXT:    v_readfirstlane_b32 s12, v0
-; GCN-NEXT:    s_mul_i32 s2, s11, s2
-; GCN-NEXT:    s_addc_u32 s12, 0, s12
-; GCN-NEXT:    v_readfirstlane_b32 s9, v3
-; GCN-NEXT:    s_add_u32 s2, s10, s2
-; GCN-NEXT:    s_addc_u32 s2, s12, s9
-; GCN-NEXT:    v_readfirstlane_b32 s9, v1
-; GCN-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-NEXT:    s_mul_i32 s8, s11, s8
+; GCN-NEXT:    s_add_u32 s12, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s13, v0
+; GCN-NEXT:    s_mul_i32 s2, s9, s2
+; GCN-NEXT:    s_addc_u32 s13, 0, s13
+; GCN-NEXT:    v_readfirstlane_b32 s11, v3
+; GCN-NEXT:    s_add_u32 s2, s12, s2
+; GCN-NEXT:    s_addc_u32 s2, s13, s11
+; GCN-NEXT:    v_readfirstlane_b32 s11, v1
+; GCN-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-NEXT:    s_mul_i32 s8, s9, s8
 ; GCN-NEXT:    s_add_u32 s2, s2, s8
-; GCN-NEXT:    s_addc_u32 s10, 0, s9
-; GCN-NEXT:    s_add_u32 s2, s13, s2
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s8, s11, s10
+; GCN-NEXT:    s_addc_u32 s8, 0, s11
+; GCN-NEXT:    s_add_u32 s2, s10, s2
+; GCN-NEXT:    s_addc_u32 s8, s9, s8
 ; GCN-NEXT:    v_mul_hi_u32 v1, s2, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s8, 24
 ; GCN-NEXT:    s_mul_i32 s8, s8, 24
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_readfirstlane_b32 s10, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
 ; GCN-NEXT:    s_add_u32 s8, s10, s8
-; GCN-NEXT:    s_addc_u32 s12, 0, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    s_addc_u32 s10, 0, s9
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    s_mul_i32 s8, s7, s12
+; GCN-NEXT:    s_mul_i32 s8, s7, s10
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
-; GCN-NEXT:    s_add_i32 s13, s9, s8
-; GCN-NEXT:    s_sub_i32 s10, 0, s13
-; GCN-NEXT:    s_mul_i32 s8, s6, s12
-; GCN-NEXT:    s_sub_u32 s14, 24, s8
+; GCN-NEXT:    s_add_i32 s11, s9, s8
+; GCN-NEXT:    s_sub_i32 s12, 0, s11
+; GCN-NEXT:    s_mul_i32 s8, s6, s10
+; GCN-NEXT:    s_sub_u32 s13, 24, s8
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s8, s9
-; GCN-NEXT:    s_subb_u32 s15, s10, s7
-; GCN-NEXT:    s_sub_u32 s16, s14, s6
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s10, s15, 0
-; GCN-NEXT:    s_cmp_ge_u32 s10, s7
-; GCN-NEXT:    s_cselect_b32 s11, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s16, s6
+; GCN-NEXT:    s_subb_u32 s12, s12, s7
+; GCN-NEXT:    s_sub_u32 s14, s13, s6
+; GCN-NEXT:    s_subb_u32 s12, s12, 0
+; GCN-NEXT:    s_cmp_ge_u32 s12, s7
 ; GCN-NEXT:    s_cselect_b32 s15, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s10, s7
-; GCN-NEXT:    s_cselect_b32 s10, s15, s11
-; GCN-NEXT:    s_add_u32 s11, s12, 1
+; GCN-NEXT:    s_cmp_ge_u32 s14, s6
+; GCN-NEXT:    s_cselect_b32 s14, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s12, s7
+; GCN-NEXT:    s_cselect_b32 s12, s14, s15
+; GCN-NEXT:    s_add_u32 s14, s10, 1
 ; GCN-NEXT:    s_addc_u32 s15, 0, 0
-; GCN-NEXT:    s_add_u32 s16, s12, 2
+; GCN-NEXT:    s_add_u32 s16, s10, 2
 ; GCN-NEXT:    s_addc_u32 s17, 0, 0
-; GCN-NEXT:    s_cmp_lg_u32 s10, 0
-; GCN-NEXT:    s_cselect_b32 s10, s16, s11
-; GCN-NEXT:    s_cselect_b32 s11, s17, s15
+; GCN-NEXT:    s_cmp_lg_u32 s12, 0
+; GCN-NEXT:    s_cselect_b32 s12, s16, s14
+; GCN-NEXT:    s_cselect_b32 s14, s17, s15
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, 0, s13
+; GCN-NEXT:    s_subb_u32 s8, 0, s11
 ; GCN-NEXT:    s_cmp_ge_u32 s8, s7
 ; GCN-NEXT:    s_cselect_b32 s9, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s6
+; GCN-NEXT:    s_cmp_ge_u32 s13, s6
 ; GCN-NEXT:    s_cselect_b32 s6, -1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s8, s7
 ; GCN-NEXT:    s_cselect_b32 s6, s6, s9
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s7, s11, 0
-; GCN-NEXT:    s_cselect_b32 s6, s10, s12
+; GCN-NEXT:    s_cselect_b32 s7, s14, 0
+; GCN-NEXT:    s_cselect_b32 s6, s12, s10
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_subb_u32 s7, s7, s4
@@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
@@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s18
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s19
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[6:7]

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ea9bb0417dfa4..862e2dd2de051 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT:    s_sub_u32 s10, 0, s8
-; GCN-NEXT:    s_subb_u32 s11, 0, s9
+; GCN-NEXT:    s_sub_u32 s0, 0, s8
+; GCN-NEXT:    s_subb_u32 s1, 0, s9
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s1, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_mul_i32 s13, s11, s0
-; GCN-NEXT:    s_mul_i32 s14, s10, s0
-; GCN-NEXT:    s_add_i32 s1, s15, s1
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT:    s_add_i32 s1, s1, s13
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v3
-; GCN-NEXT:    s_mul_i32 s15, s0, s1
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT:    s_add_u32 s13, s13, s15
+; GCN-NEXT:    v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s11, s0, s10
+; GCN-NEXT:    v_readfirstlane_b32 s14, v2
+; GCN-NEXT:    s_mul_i32 s12, s1, s2
+; GCN-NEXT:    s_mul_i32 s13, s0, s2
+; GCN-NEXT:    s_add_i32 s11, s14, s11
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT:    s_add_i32 s11, s11, s12
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_mul_i32 s15, s2, s11
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT:    s_add_u32 s12, s12, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s14, s12, s14
+; GCN-NEXT:    s_mul_i32 s13, s10, s13
 ; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s16, v4
-; GCN-NEXT:    s_add_u32 s13, s13, s14
-; GCN-NEXT:    s_addc_u32 s13, s15, s16
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s14, s14, 0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_add_u32 s1, s13, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s14
-; GCN-NEXT:    s_add_u32 s14, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s12, s12, s13
-; GCN-NEXT:    s_mul_i32 s0, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s0, s1, s0
-; GCN-NEXT:    s_mul_i32 s11, s11, s14
-; GCN-NEXT:    s_mul_i32 s1, s10, s14
-; GCN-NEXT:    s_add_i32 s0, s0, s11
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT:    s_mul_i32 s11, s14, s0
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_add_u32 s11, s15, s11
+; GCN-NEXT:    v_readfirstlane_b32 s14, v4
+; GCN-NEXT:    s_add_u32 s12, s12, s13
+; GCN-NEXT:    s_addc_u32 s12, s15, s14
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NEXT:    s_mul_i32 s11, s10, s11
+; GCN-NEXT:    s_add_u32 s11, s12, s11
+; GCN-NEXT:    s_addc_u32 s12, 0, s13
+; GCN-NEXT:    s_add_u32 s11, s2, s11
+; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    s_addc_u32 s10, s10, s12
+; GCN-NEXT:    s_mul_i32 s12, s0, s10
+; GCN-NEXT:    s_mul_i32 s1, s1, s11
 ; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_add_u32 s1, s11, s1
-; GCN-NEXT:    s_addc_u32 s1, s13, s10
-; GCN-NEXT:    v_readfirstlane_b32 s10, v1
-; GCN-NEXT:    s_addc_u32 s10, s10, 0
-; GCN-NEXT:    s_mul_i32 s0, s12, s0
-; GCN-NEXT:    s_add_u32 s0, s1, s0
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    s_add_u32 s11, s14, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s1, s12, s10
+; GCN-NEXT:    s_add_i32 s12, s13, s12
+; GCN-NEXT:    s_mul_i32 s0, s0, s11
+; GCN-NEXT:    s_add_i32 s1, s12, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    s_mul_i32 s13, s11, s1
+; GCN-NEXT:    v_readfirstlane_b32 s15, v2
+; GCN-NEXT:    s_add_u32 s13, s15, s13
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s0, s10, s0
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_add_u32 s0, s13, s0
+; GCN-NEXT:    s_addc_u32 s0, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v1
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s1, s10, s1
+; GCN-NEXT:    s_add_u32 s0, s0, s1
+; GCN-NEXT:    s_addc_u32 s1, 0, s12
+; GCN-NEXT:    s_add_u32 s11, s11, s0
+; GCN-NEXT:    s_addc_u32 s1, s10, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_mul_i32 s4, s8, s4
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s4, s5
 ; GCN-NEXT:    s_subb_u32 s13, s10, s9
 ; GCN-NEXT:    s_sub_u32 s14, s6, s8
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s15, s10, s11
 ; GCN-NEXT:    s_subb_u32 s15, s13, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_cmp_eq_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, s17, s16
 ; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s13, s13, s9
-; GCN-NEXT:    s_sub_u32 s17, s14, s8
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s10, s13, 0
+; GCN-NEXT:    s_subb_u32 s10, s13, s9
+; GCN-NEXT:    s_sub_u32 s11, s14, s8
+; GCN-NEXT:    s_subb_u32 s10, s10, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s16, 0
-; GCN-NEXT:    s_cselect_b32 s11, s17, s14
+; GCN-NEXT:    s_cselect_b32 s11, s11, s14
 ; GCN-NEXT:    s_cselect_b32 s10, s10, s15
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_cmp_lg_u32 s5, 0
 ; GCN-NEXT:    s_cselect_b32 s4, s10, s4
 ; GCN-NEXT:    s_cselect_b32 s5, s11, s6
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s18
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s19
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
@@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GCN-NEXT:    s_sub_u32 s10, 0, s4
-; GCN-NEXT:    s_subb_u32 s11, 0, s5
+; GCN-NEXT:    s_sub_u32 s8, 0, s4
+; GCN-NEXT:    s_subb_u32 s9, 0, s5
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    s_mul_i32 s9, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_mul_i32 s13, s11, s8
-; GCN-NEXT:    s_mul_i32 s14, s10, s8
-; GCN-NEXT:    s_add_i32 s9, s15, s9
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT:    s_add_i32 s9, s9, s13
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v3
-; GCN-NEXT:    s_mul_i32 s15, s8, s9
-; GCN-NEXT:    s_add_u32 s13, s13, s15
-; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, v1, s9
-; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    s_mul_i32 s14, s12, s14
-; GCN-NEXT:    v_readfirstlane_b32 s16, v4
-; GCN-NEXT:    s_add_u32 s13, s13, s14
-; GCN-NEXT:    s_addc_u32 s13, s15, s16
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v0
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s11, s8, s10
+; GCN-NEXT:    v_readfirstlane_b32 s14, v2
+; GCN-NEXT:    s_mul_i32 s12, s9, s2
+; GCN-NEXT:    s_mul_i32 s13, s8, s2
+; GCN-NEXT:    s_add_i32 s11, s14, s11
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT:    s_add_i32 s11, s11, s12
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_mul_i32 s14, s2, s11
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT:    s_add_u32 s12, s12, s14
 ; GCN-NEXT:    v_readfirstlane_b32 s14, v0
-; GCN-NEXT:    s_addc_u32 s14, s14, 0
-; GCN-NEXT:    s_mul_i32 s9, s12, s9
-; GCN-NEXT:    s_add_u32 s9, s13, s9
-; GCN-NEXT:    s_addc_u32 s13, 0, s14
-; GCN-NEXT:    s_add_u32 s14, s8, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s12, s12, s13
-; GCN-NEXT:    s_mul_i32 s8, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s9, v0
-; GCN-NEXT:    s_add_i32 s8, s9, s8
-; GCN-NEXT:    s_mul_i32 s11, s11, s14
-; GCN-NEXT:    s_mul_i32 s9, s10, s14
-; GCN-NEXT:    s_add_i32 s8, s8, s11
-; GCN-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT:    s_mul_i32 s11, s14, s8
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_add_u32 s11, s15, s11
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    s_mul_i32 s13, s10, s13
+; GCN-NEXT:    v_readfirstlane_b32 s15, v4
+; GCN-NEXT:    s_add_u32 s12, s12, s13
+; GCN-NEXT:    s_addc_u32 s12, s14, s15
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NEXT:    s_mul_i32 s11, s10, s11
+; GCN-NEXT:    s_add_u32 s11, s12, s11
+; GCN-NEXT:    s_addc_u32 s12, 0, s13
+; GCN-NEXT:    s_add_u32 s11, s2, s11
+; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_addc_u32 s10, s10, s12
+; GCN-NEXT:    s_mul_i32 s12, s8, s10
+; GCN-NEXT:    s_mul_i32 s9, s9, s11
 ; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s9, s12, s9
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_add_u32 s9, s11, s9
-; GCN-NEXT:    s_addc_u32 s9, s13, s10
-; GCN-NEXT:    v_readfirstlane_b32 s10, v1
-; GCN-NEXT:    s_addc_u32 s10, s10, 0
-; GCN-NEXT:    s_mul_i32 s8, s12, s8
-; GCN-NEXT:    s_add_u32 s8, s9, s8
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    s_add_u32 s11, s14, s8
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s10, s12, s10
+; GCN-NEXT:    s_add_i32 s12, s13, s12
+; GCN-NEXT:    s_mul_i32 s8, s8, s11
+; GCN-NEXT:    s_add_i32 s9, s12, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    s_mul_i32 s13, s11, s9
+; GCN-NEXT:    v_readfirstlane_b32 s15, v2
+; GCN-NEXT:    s_add_u32 s13, s15, s13
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s8, s10, s8
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_add_u32 s8, s13, s8
+; GCN-NEXT:    s_addc_u32 s8, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v1
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s9, s10, s9
+; GCN-NEXT:    s_add_u32 s8, s8, s9
+; GCN-NEXT:    s_addc_u32 s9, 0, s12
+; GCN-NEXT:    s_add_u32 s11, s11, s8
+; GCN-NEXT:    s_addc_u32 s10, s10, s9
 ; GCN-NEXT:    s_ashr_i32 s8, s7, 31
 ; GCN-NEXT:    s_add_u32 s6, s6, s8
 ; GCN-NEXT:    s_mov_b32 s9, s8
@@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GCN-NEXT:    s_addc_u32 s11, 0, s12
 ; GCN-NEXT:    s_mul_i32 s11, s4, s11
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_readfirstlane_b32 s12, v0
 ; GCN-NEXT:    s_add_i32 s11, s12, s11
 ; GCN-NEXT:    s_mul_i32 s12, s5, s10
@@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mul_i32 s10, s4, s10
 ; GCN-NEXT:    s_sub_u32 s6, s6, s10
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s13, s10, s11
 ; GCN-NEXT:    s_subb_u32 s15, s12, s5
 ; GCN-NEXT:    s_sub_u32 s16, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_or_b32 s17, s12, s13
 ; GCN-NEXT:    s_subb_u32 s17, s15, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s17, s5
 ; GCN-NEXT:    s_cselect_b32 s18, -1, 0
@@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_cmp_eq_u32 s17, s5
 ; GCN-NEXT:    s_cselect_b32 s18, s19, s18
 ; GCN-NEXT:    s_or_b32 s12, s12, s13
-; GCN-NEXT:    s_subb_u32 s15, s15, s5
-; GCN-NEXT:    s_sub_u32 s19, s16, s4
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_or_b32 s12, s12, s13
-; GCN-NEXT:    s_subb_u32 s12, s15, 0
+; GCN-NEXT:    s_subb_u32 s12, s15, s5
+; GCN-NEXT:    s_sub_u32 s13, s16, s4
+; GCN-NEXT:    s_subb_u32 s12, s12, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s18, 0
-; GCN-NEXT:    s_cselect_b32 s13, s19, s16
+; GCN-NEXT:    s_cselect_b32 s13, s13, s16
 ; GCN-NEXT:    s_cselect_b32 s12, s12, s17
 ; GCN-NEXT:    s_or_b32 s10, s10, s11
 ; GCN-NEXT:    s_subb_u32 s7, s7, s14
@@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s10, s10, s11
 ; GCN-IR-NEXT:    s_addc_u32 s10, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
@@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_sub_u32 s14, s14, s20
 ; GCN-IR-NEXT:    s_subb_u32 s15, s15, s21
 ; GCN-IR-NEXT:    s_add_u32 s18, s18, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s20, s20, s21
 ; GCN-IR-NEXT:    s_addc_u32 s19, s19, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
@@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s5
 ; GCN-NEXT:    s_sub_u32 s2, 0, s4
-; GCN-NEXT:    s_subb_u32 s8, 0, s5
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_subb_u32 s6, 0, s5
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s9, v1
-; GCN-NEXT:    v_readfirstlane_b32 s6, v0
-; GCN-NEXT:    s_mul_i32 s7, s2, s9
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_mul_i32 s10, s8, s6
-; GCN-NEXT:    s_mul_i32 s11, s2, s6
-; GCN-NEXT:    s_add_i32 s7, s12, s7
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT:    s_add_i32 s7, s7, s10
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s7
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_mul_i32 s13, s6, s7
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s7
-; GCN-NEXT:    s_add_u32 s10, s10, s13
-; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s11, s9, s11
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v4
-; GCN-NEXT:    s_add_u32 s10, s10, s11
-; GCN-NEXT:    s_addc_u32 s10, s13, s12
-; GCN-NEXT:    v_readfirstlane_b32 s11, v1
-; GCN-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-NEXT:    s_mul_i32 s7, s9, s7
-; GCN-NEXT:    s_add_u32 s7, s10, s7
-; GCN-NEXT:    s_addc_u32 s10, 0, s11
-; GCN-NEXT:    s_add_u32 s11, s6, s7
-; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s7, v1
+; GCN-NEXT:    v_readfirstlane_b32 s3, v0
+; GCN-NEXT:    s_mul_i32 s8, s2, s7
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    s_mul_i32 s9, s6, s3
+; GCN-NEXT:    s_mul_i32 s10, s2, s3
+; GCN-NEXT:    s_add_i32 s8, s11, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT:    s_add_i32 s8, s8, s9
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT:    s_mul_i32 s12, s3, s8
+; GCN-NEXT:    s_add_u32 s9, s9, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_mul_i32 s10, s7, s10
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v4
+; GCN-NEXT:    s_add_u32 s9, s9, s10
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s9, s12, s11
+; GCN-NEXT:    s_addc_u32 s10, s13, 0
+; GCN-NEXT:    s_mul_i32 s8, s7, s8
+; GCN-NEXT:    s_add_u32 s8, s9, s8
+; GCN-NEXT:    s_addc_u32 s9, 0, s10
+; GCN-NEXT:    s_add_u32 s8, s3, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_addc_u32 s9, s9, s10
-; GCN-NEXT:    s_mul_i32 s6, s2, s9
-; GCN-NEXT:    v_readfirstlane_b32 s7, v0
-; GCN-NEXT:    s_add_i32 s6, s7, s6
-; GCN-NEXT:    s_mul_i32 s8, s8, s11
-; GCN-NEXT:    s_mul_i32 s2, s2, s11
-; GCN-NEXT:    s_add_i32 s6, s6, s8
+; GCN-NEXT:    s_addc_u32 s7, s7, s9
+; GCN-NEXT:    s_mul_i32 s9, s2, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s8
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    s_add_i32 s9, s10, s9
+; GCN-NEXT:    s_mul_i32 s2, s2, s8
+; GCN-NEXT:    s_add_i32 s6, s9, s6
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mul_i32 s8, s11, s6
+; GCN-NEXT:    v_mul_hi_u32 v3, s7, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_mul_i32 s10, s8, s6
 ; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_add_u32 s8, s12, s8
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mul_i32 s2, s9, s2
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    v_readfirstlane_b32 s7, v3
-; GCN-NEXT:    s_add_u32 s2, s8, s2
-; GCN-NEXT:    s_addc_u32 s2, s10, s7
-; GCN-NEXT:    v_readfirstlane_b32 s7, v1
-; GCN-NEXT:    s_addc_u32 s7, s7, 0
-; GCN-NEXT:    s_mul_i32 s6, s9, s6
+; GCN-NEXT:    s_add_u32 s10, s12, s10
+; GCN-NEXT:    v_readfirstlane_b32 s11, v0
+; GCN-NEXT:    s_mul_i32 s2, s7, s2
+; GCN-NEXT:    s_addc_u32 s11, 0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_add_u32 s2, s10, s2
+; GCN-NEXT:    s_addc_u32 s2, s11, s9
+; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mul_i32 s6, s7, s6
 ; GCN-NEXT:    s_add_u32 s2, s2, s6
-; GCN-NEXT:    s_addc_u32 s8, 0, s7
-; GCN-NEXT:    s_add_u32 s2, s11, s2
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_addc_u32 s6, s9, s8
+; GCN-NEXT:    s_addc_u32 s6, 0, s9
+; GCN-NEXT:    s_add_u32 s2, s8, s2
+; GCN-NEXT:    s_addc_u32 s6, s7, s6
 ; GCN-NEXT:    v_mul_hi_u32 v1, s2, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, 24
 ; GCN-NEXT:    s_mul_i32 s6, s6, 24
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
 ; GCN-NEXT:    s_add_u32 s6, s8, s6
@@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GCN-NEXT:    s_mul_i32 s7, s5, s6
 ; GCN-NEXT:    s_mul_i32 s6, s4, s6
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    s_add_i32 s10, s8, s7
 ; GCN-NEXT:    s_sub_i32 s8, 0, s10
 ; GCN-NEXT:    s_sub_u32 s11, 24, s6
 ; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s9, s6, s7
 ; GCN-NEXT:    s_subb_u32 s12, s8, s5
 ; GCN-NEXT:    s_sub_u32 s13, s11, s4
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s8, s9
 ; GCN-NEXT:    s_subb_u32 s14, s12, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s14, s5
 ; GCN-NEXT:    s_cselect_b32 s15, -1, 0
@@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_cmp_eq_u32 s14, s5
 ; GCN-NEXT:    s_cselect_b32 s15, s16, s15
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s12, s12, s5
-; GCN-NEXT:    s_sub_u32 s16, s13, s4
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, s12, 0
+; GCN-NEXT:    s_subb_u32 s8, s12, s5
+; GCN-NEXT:    s_sub_u32 s9, s13, s4
+; GCN-NEXT:    s_subb_u32 s8, s8, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s15, 0
-; GCN-NEXT:    s_cselect_b32 s9, s16, s13
+; GCN-NEXT:    s_cselect_b32 s9, s9, s13
 ; GCN-NEXT:    s_cselect_b32 s8, s8, s14
 ; GCN-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-NEXT:    s_subb_u32 s6, 0, s10
@@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s8, s2, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s9, s10, s11
 ; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s2, 63, s2
@@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s16
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s17
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[6:7]

diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bdd22f25e91c8..b000fae124ede 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_add_u32 s2, s2, s8
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    s_addc_u32 s3, s3, s9
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_add_u32 s4, s4, s6
-; SI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT:    s_or_b32 s6, s12, s13
 ; SI-NEXT:    s_addc_u32 s5, s5, s7
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1

diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fd461ac80ea55..775483c040b7f 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s16
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s17
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[2:3], s[4:5]
@@ -786,12 +782,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-LABEL: s_test_udiv_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_sub_u32 s6, 0, s2
-; GCN-NEXT:    s_subb_u32 s8, 0, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -800,118 +795,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v1
+; GCN-NEXT:    v_readfirstlane_b32 s7, v0
+; GCN-NEXT:    s_mul_i32 s8, s4, s6
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    s_mul_i32 s9, s5, s7
+; GCN-NEXT:    s_mul_i32 s10, s4, s7
+; GCN-NEXT:    s_add_i32 s8, s11, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT:    s_add_i32 s8, s8, s9
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_mul_i32 s12, s7, s8
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT:    s_add_u32 s9, s9, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_mul_i32 s10, s6, s10
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v4
+; GCN-NEXT:    s_add_u32 s9, s9, s10
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s9, s12, s11
+; GCN-NEXT:    s_mul_i32 s8, s6, s8
+; GCN-NEXT:    s_addc_u32 s10, s13, 0
+; GCN-NEXT:    s_add_u32 s8, s9, s8
+; GCN-NEXT:    s_addc_u32 s9, 0, s10
+; GCN-NEXT:    s_add_u32 s8, s7, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT:    s_addc_u32 s6, s6, s9
+; GCN-NEXT:    s_mul_i32 s9, s4, s6
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    s_add_i32 s9, s10, s9
+; GCN-NEXT:    s_mul_i32 s4, s4, s8
+; GCN-NEXT:    s_add_i32 s5, s9, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_mul_i32 s10, s8, s5
+; GCN-NEXT:    v_readfirstlane_b32 s12, v2
+; GCN-NEXT:    s_add_u32 s10, s12, s10
+; GCN-NEXT:    v_readfirstlane_b32 s11, v0
+; GCN-NEXT:    s_mul_i32 s4, s6, s4
+; GCN-NEXT:    s_addc_u32 s11, 0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_add_u32 s4, s10, s4
+; GCN-NEXT:    s_addc_u32 s4, s11, s9
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mul_i32 s5, s6, s5
+; GCN-NEXT:    s_add_u32 s4, s4, s5
+; GCN-NEXT:    s_addc_u32 s5, 0, s9
+; GCN-NEXT:    s_add_u32 s4, s8, s4
+; GCN-NEXT:    s_addc_u32 s5, s6, s5
+; GCN-NEXT:    v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT:    v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT:    s_mul_i32 s5, s5, 24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    s_mul_i32 s5, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_mul_i32 s10, s8, s4
-; GCN-NEXT:    s_mul_i32 s11, s6, s4
-; GCN-NEXT:    s_add_i32 s5, s12, s5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT:    s_add_i32 s5, s5, s10
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT:    s_mul_i32 s13, s4, s5
-; GCN-NEXT:    s_add_u32 s10, s10, s13
-; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s11, s9, s11
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v4
-; GCN-NEXT:    s_add_u32 s10, s10, s11
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s10, s13, s12
-; GCN-NEXT:    s_addc_u32 s11, s14, 0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_add_u32 s5, s10, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s11
-; GCN-NEXT:    s_add_u32 s11, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s9, s9, s10
-; GCN-NEXT:    s_mul_i32 s4, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_i32 s4, s5, s4
-; GCN-NEXT:    s_mul_i32 s8, s8, s11
-; GCN-NEXT:    s_mul_i32 s5, s6, s11
-; GCN-NEXT:    s_add_i32 s4, s4, s8
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mul_i32 s8, s11, s4
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_add_u32 s8, s12, s8
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    v_readfirstlane_b32 s6, v3
 ; GCN-NEXT:    s_add_u32 s5, s8, s5
-; GCN-NEXT:    s_addc_u32 s5, s10, s6
-; GCN-NEXT:    v_readfirstlane_b32 s6, v1
-; GCN-NEXT:    s_addc_u32 s6, s6, 0
-; GCN-NEXT:    s_mul_i32 s4, s9, s4
-; GCN-NEXT:    s_add_u32 s4, s5, s4
-; GCN-NEXT:    s_addc_u32 s6, 0, s6
-; GCN-NEXT:    s_add_u32 s8, s11, s4
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s4, s9, s6
-; GCN-NEXT:    v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT:    v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT:    s_mul_i32 s4, s4, 24
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v1
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_u32 s4, s8, s4
-; GCN-NEXT:    s_addc_u32 s10, 0, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NEXT:    s_addc_u32 s8, 0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mul_i32 s0, s3, s10
+; GCN-NEXT:    s_mul_i32 s0, s3, s8
 ; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s11, s1, s0
-; GCN-NEXT:    s_sub_i32 s8, 0, s11
-; GCN-NEXT:    s_mul_i32 s0, s2, s10
-; GCN-NEXT:    s_sub_u32 s12, 24, s0
+; GCN-NEXT:    s_add_i32 s9, s1, s0
+; GCN-NEXT:    s_sub_i32 s10, 0, s9
+; GCN-NEXT:    s_mul_i32 s0, s2, s8
+; GCN-NEXT:    s_sub_u32 s11, 24, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s9, s0, s1
-; GCN-NEXT:    s_subb_u32 s13, s8, s3
-; GCN-NEXT:    s_sub_u32 s14, s12, s2
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, s13, 0
-; GCN-NEXT:    s_cmp_ge_u32 s8, s3
-; GCN-NEXT:    s_cselect_b32 s9, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s2
+; GCN-NEXT:    s_subb_u32 s10, s10, s3
+; GCN-NEXT:    s_sub_u32 s12, s11, s2
+; GCN-NEXT:    s_subb_u32 s10, s10, 0
+; GCN-NEXT:    s_cmp_ge_u32 s10, s3
 ; GCN-NEXT:    s_cselect_b32 s13, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, s3
-; GCN-NEXT:    s_cselect_b32 s8, s13, s9
-; GCN-NEXT:    s_add_u32 s9, s10, 1
+; GCN-NEXT:    s_cmp_ge_u32 s12, s2
+; GCN-NEXT:    s_cselect_b32 s12, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s10, s3
+; GCN-NEXT:    s_cselect_b32 s10, s12, s13
+; GCN-NEXT:    s_add_u32 s12, s8, 1
 ; GCN-NEXT:    s_addc_u32 s13, 0, 0
-; GCN-NEXT:    s_add_u32 s14, s10, 2
+; GCN-NEXT:    s_add_u32 s14, s8, 2
 ; GCN-NEXT:    s_addc_u32 s15, 0, 0
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s8, s14, s9
-; GCN-NEXT:    s_cselect_b32 s9, s15, s13
+; GCN-NEXT:    s_cmp_lg_u32 s10, 0
+; GCN-NEXT:    s_cselect_b32 s10, s14, s12
+; GCN-NEXT:    s_cselect_b32 s12, s15, s13
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_subb_u32 s0, 0, s11
+; GCN-NEXT:    s_subb_u32 s0, 0, s9
 ; GCN-NEXT:    s_cmp_ge_u32 s0, s3
 ; GCN-NEXT:    s_cselect_b32 s1, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s12, s2
+; GCN-NEXT:    s_cmp_ge_u32 s11, s2
 ; GCN-NEXT:    s_cselect_b32 s2, -1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s0, s3
 ; GCN-NEXT:    s_cselect_b32 s0, s2, s1
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s9, 0
-; GCN-NEXT:    s_cselect_b32 s1, s8, s10
+; GCN-NEXT:    s_cselect_b32 s0, s12, 0
+; GCN-NEXT:    s_cselect_b32 s1, s10, s8
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -937,8 +926,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -969,8 +956,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s16
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s17
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
@@ -1307,8 +1292,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s11, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -1336,8 +1319,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s2, s2, s8
 ; GCN-IR-NEXT:    s_subb_u32 s3, s3, 0
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s12, s12, s13
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]

diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 137dc1fe42294..28e6627b87413 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT:    s_sub_u32 s10, 0, s8
-; GCN-NEXT:    s_subb_u32 s11, 0, s9
+; GCN-NEXT:    s_sub_u32 s0, 0, s8
+; GCN-NEXT:    s_subb_u32 s1, 0, s9
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s1, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_mul_i32 s13, s11, s0
-; GCN-NEXT:    s_mul_i32 s14, s10, s0
-; GCN-NEXT:    s_add_i32 s1, s15, s1
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT:    s_add_i32 s1, s1, s13
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v3
-; GCN-NEXT:    s_mul_i32 s15, s0, s1
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT:    s_add_u32 s13, s13, s15
+; GCN-NEXT:    v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s11, s0, s10
+; GCN-NEXT:    v_readfirstlane_b32 s14, v2
+; GCN-NEXT:    s_mul_i32 s12, s1, s2
+; GCN-NEXT:    s_mul_i32 s13, s0, s2
+; GCN-NEXT:    s_add_i32 s11, s14, s11
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT:    s_add_i32 s11, s11, s12
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_mul_i32 s15, s2, s11
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT:    s_add_u32 s12, s12, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s14, s12, s14
+; GCN-NEXT:    s_mul_i32 s13, s10, s13
 ; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s16, v4
-; GCN-NEXT:    s_add_u32 s13, s13, s14
-; GCN-NEXT:    s_addc_u32 s13, s15, s16
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s14, s14, 0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_add_u32 s1, s13, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s14
-; GCN-NEXT:    s_add_u32 s14, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s12, s12, s13
-; GCN-NEXT:    s_mul_i32 s0, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s0, s1, s0
-; GCN-NEXT:    s_mul_i32 s11, s11, s14
-; GCN-NEXT:    s_mul_i32 s1, s10, s14
-; GCN-NEXT:    s_add_i32 s0, s0, s11
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT:    s_mul_i32 s11, s14, s0
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_add_u32 s11, s15, s11
+; GCN-NEXT:    v_readfirstlane_b32 s14, v4
+; GCN-NEXT:    s_add_u32 s12, s12, s13
+; GCN-NEXT:    s_addc_u32 s12, s15, s14
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NEXT:    s_mul_i32 s11, s10, s11
+; GCN-NEXT:    s_add_u32 s11, s12, s11
+; GCN-NEXT:    s_addc_u32 s12, 0, s13
+; GCN-NEXT:    s_add_u32 s11, s2, s11
+; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    s_addc_u32 s10, s10, s12
+; GCN-NEXT:    s_mul_i32 s12, s0, s10
+; GCN-NEXT:    s_mul_i32 s1, s1, s11
 ; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_add_u32 s1, s11, s1
-; GCN-NEXT:    s_addc_u32 s1, s13, s10
-; GCN-NEXT:    v_readfirstlane_b32 s10, v1
-; GCN-NEXT:    s_addc_u32 s10, s10, 0
-; GCN-NEXT:    s_mul_i32 s0, s12, s0
-; GCN-NEXT:    s_add_u32 s0, s1, s0
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    s_add_u32 s11, s14, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s1, s12, s10
+; GCN-NEXT:    s_add_i32 s12, s13, s12
+; GCN-NEXT:    s_mul_i32 s0, s0, s11
+; GCN-NEXT:    s_add_i32 s1, s12, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    s_mul_i32 s13, s11, s1
+; GCN-NEXT:    v_readfirstlane_b32 s15, v2
+; GCN-NEXT:    s_add_u32 s13, s15, s13
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s0, s10, s0
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_add_u32 s0, s13, s0
+; GCN-NEXT:    s_addc_u32 s0, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v1
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s1, s10, s1
+; GCN-NEXT:    s_add_u32 s0, s0, s1
+; GCN-NEXT:    s_addc_u32 s1, 0, s12
+; GCN-NEXT:    s_add_u32 s11, s11, s0
+; GCN-NEXT:    s_addc_u32 s1, s10, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_mul_i32 s4, s8, s4
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s4, s5
 ; GCN-NEXT:    s_subb_u32 s13, s10, s9
 ; GCN-NEXT:    s_sub_u32 s14, s6, s8
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s15, s10, s11
 ; GCN-NEXT:    s_subb_u32 s15, s13, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_cmp_eq_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, s17, s16
 ; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s13, s13, s9
-; GCN-NEXT:    s_sub_u32 s17, s14, s8
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s10, s13, 0
+; GCN-NEXT:    s_subb_u32 s10, s13, s9
+; GCN-NEXT:    s_sub_u32 s11, s14, s8
+; GCN-NEXT:    s_subb_u32 s10, s10, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s16, 0
-; GCN-NEXT:    s_cselect_b32 s11, s17, s14
+; GCN-NEXT:    s_cselect_b32 s11, s11, s14
 ; GCN-NEXT:    s_cselect_b32 s10, s10, s15
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_cmp_lg_u32 s5, 0
 ; GCN-NEXT:    s_cselect_b32 s4, s10, s4
 ; GCN-NEXT:    s_cselect_b32 s5, s11, s6
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s18
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s19
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
@@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-LABEL: s_test_urem_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_sub_u32 s6, 0, s2
-; GCN-NEXT:    s_subb_u32 s8, 0, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v1
+; GCN-NEXT:    v_readfirstlane_b32 s7, v0
+; GCN-NEXT:    s_mul_i32 s8, s4, s6
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    s_mul_i32 s9, s5, s7
+; GCN-NEXT:    s_mul_i32 s10, s4, s7
+; GCN-NEXT:    s_add_i32 s8, s11, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT:    s_add_i32 s8, s8, s9
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_mul_i32 s12, s7, s8
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT:    s_add_u32 s9, s9, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_mul_i32 s10, s6, s10
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v4
+; GCN-NEXT:    s_add_u32 s9, s9, s10
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s9, s12, s11
+; GCN-NEXT:    s_mul_i32 s8, s6, s8
+; GCN-NEXT:    s_addc_u32 s10, s13, 0
+; GCN-NEXT:    s_add_u32 s8, s9, s8
+; GCN-NEXT:    s_addc_u32 s9, 0, s10
+; GCN-NEXT:    s_add_u32 s8, s7, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT:    s_addc_u32 s6, s6, s9
+; GCN-NEXT:    s_mul_i32 s9, s4, s6
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    s_add_i32 s9, s10, s9
+; GCN-NEXT:    s_mul_i32 s4, s4, s8
+; GCN-NEXT:    s_add_i32 s5, s9, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_mul_i32 s10, s8, s5
+; GCN-NEXT:    v_readfirstlane_b32 s12, v2
+; GCN-NEXT:    s_add_u32 s10, s12, s10
+; GCN-NEXT:    v_readfirstlane_b32 s11, v0
+; GCN-NEXT:    s_mul_i32 s4, s6, s4
+; GCN-NEXT:    s_addc_u32 s11, 0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_add_u32 s4, s10, s4
+; GCN-NEXT:    s_addc_u32 s4, s11, s9
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mul_i32 s5, s6, s5
+; GCN-NEXT:    s_add_u32 s4, s4, s5
+; GCN-NEXT:    s_addc_u32 s5, 0, s9
+; GCN-NEXT:    s_add_u32 s4, s8, s4
+; GCN-NEXT:    s_addc_u32 s5, s6, s5
+; GCN-NEXT:    v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT:    v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT:    s_mul_i32 s5, s5, 24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    s_mul_i32 s5, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_mul_i32 s10, s8, s4
-; GCN-NEXT:    s_mul_i32 s11, s6, s4
-; GCN-NEXT:    s_add_i32 s5, s12, s5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT:    s_add_i32 s5, s5, s10
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT:    s_mul_i32 s13, s4, s5
-; GCN-NEXT:    s_add_u32 s10, s10, s13
-; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s11, s9, s11
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v4
-; GCN-NEXT:    s_add_u32 s10, s10, s11
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s10, s13, s12
-; GCN-NEXT:    s_addc_u32 s11, s14, 0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_add_u32 s5, s10, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s11
-; GCN-NEXT:    s_add_u32 s11, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s9, s9, s10
-; GCN-NEXT:    s_mul_i32 s4, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_i32 s4, s5, s4
-; GCN-NEXT:    s_mul_i32 s8, s8, s11
-; GCN-NEXT:    s_mul_i32 s5, s6, s11
-; GCN-NEXT:    s_add_i32 s4, s4, s8
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mul_i32 s8, s11, s4
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_add_u32 s8, s12, s8
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    v_readfirstlane_b32 s6, v3
 ; GCN-NEXT:    s_add_u32 s5, s8, s5
-; GCN-NEXT:    s_addc_u32 s5, s10, s6
-; GCN-NEXT:    v_readfirstlane_b32 s6, v1
-; GCN-NEXT:    s_addc_u32 s6, s6, 0
-; GCN-NEXT:    s_mul_i32 s4, s9, s4
-; GCN-NEXT:    s_add_u32 s4, s5, s4
-; GCN-NEXT:    s_addc_u32 s6, 0, s6
-; GCN-NEXT:    s_add_u32 s8, s11, s4
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s4, s9, s6
-; GCN-NEXT:    v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT:    v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT:    s_mul_i32 s4, s4, 24
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v1
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_u32 s4, s8, s4
-; GCN-NEXT:    s_addc_u32 s8, 0, s5
+; GCN-NEXT:    s_addc_u32 s8, 0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
@@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_mul_i32 s0, s2, s8
 ; GCN-NEXT:    s_sub_u32 s11, 24, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s0, s1
 ; GCN-NEXT:    s_subb_u32 s12, s9, s3
 ; GCN-NEXT:    s_sub_u32 s13, s11, s2
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s8, s9
 ; GCN-NEXT:    s_subb_u32 s14, s12, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s14, s3
 ; GCN-NEXT:    s_cselect_b32 s15, -1, 0
@@ -912,13 +893,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_cmp_eq_u32 s14, s3
 ; GCN-NEXT:    s_cselect_b32 s15, s16, s15
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s12, s12, s3
-; GCN-NEXT:    s_sub_u32 s16, s13, s2
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, s12, 0
+; GCN-NEXT:    s_subb_u32 s8, s12, s3
+; GCN-NEXT:    s_sub_u32 s9, s13, s2
+; GCN-NEXT:    s_subb_u32 s8, s8, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s15, 0
-; GCN-NEXT:    s_cselect_b32 s9, s16, s13
+; GCN-NEXT:    s_cselect_b32 s9, s9, s13
 ; GCN-NEXT:    s_cselect_b32 s8, s8, s14
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_subb_u32 s0, 0, s10
@@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_cmp_lg_u32 s1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, s8, s0
 ; GCN-NEXT:    s_cselect_b32 s1, s9, s11
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s16
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s17
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
@@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s11, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s8, s8, s10
 ; GCN-IR-NEXT:    s_subb_u32 s9, s9, 0
 ; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s14, s14, s15
 ; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]

diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index e8db6471b6a46..8a54ad301f48a 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_sub_u32 s2, s2, s8
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    s_subb_u32 s3, s3, s9
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_sub_u32 s4, s4, s6
-; SI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT:    s_or_b32 s6, s12, s13
 ; SI-NEXT:    s_subb_u32 s5, s5, s7
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1


        

