[llvm] [AMDGPU] Delete redundant s_or_b32 (PR #165261)

via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 27 08:05:40 PDT 2025


https://github.com/LU-JOHN created https://github.com/llvm/llvm-project/pull/165261

Transform sequences like:

  s_cselect_b64 s12, -1, 0
  s_or_b32 s6, s12, s13

where s6 is dead, to:

  s_cselect_b64 s12, -1, 0

From 10f5d95b3f3ae08c77729362fd09c4f4d8325454 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 22 Oct 2025 12:10:21 -0500
Subject: [PATCH 1/2] Refactor code

Signed-off-by: John Lu <John.Lu at amd.com>
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 61 +++++++++++++-------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d930a21c2d7f5..9dd3bedf38bd7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10628,7 +10628,31 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
     return false;
 
-  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+  // SCC is already valid after SCCValid.
+  // SCCRedefine will redefine SCC to the same value already available after
+// SCCValid. If there are no intervening SCC conflicts, delete SCCRedefine
+// and update kill/dead flags if necessary.
+  const auto optimizeSCC = [this](MachineInstr *SCCValid,
+                                  MachineInstr *SCCRedefine) -> bool {
+    MachineInstr *KillsSCC = nullptr;
+    for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
+                                       SCCRedefine->getIterator())) {
+      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+        return false;
+      if (MI.killsRegister(AMDGPU::SCC, &RI))
+        KillsSCC = &MI;
+    }
+    if (MachineOperand *SccDef =
+            SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+      SccDef->setIsDead(false);
+    if (KillsSCC)
+      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+    SCCRedefine->eraseFromParent();
+
+    return true;
+  };
+
+  const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, optimizeSCC,
                                   this]() -> bool {
     if (CmpValue != 0)
       return false;
@@ -10663,25 +10687,13 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
       return false;
 
-    MachineInstr *KillsSCC = nullptr;
-    for (MachineInstr &MI :
-         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
-      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
-        return false;
-      if (MI.killsRegister(AMDGPU::SCC, &RI))
-        KillsSCC = &MI;
-    }
+    if (!optimizeSCC(Def, &CmpInstr))
+      return false;
 
-    if (MachineOperand *SccDef =
-            Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
-      SccDef->setIsDead(false);
-    if (KillsSCC)
-      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
-    CmpInstr.eraseFromParent();
     return true;
   };
 
-  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
+  const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, optimizeSCC,
                                this](int64_t ExpectedValue, unsigned SrcSize,
                                      bool IsReversible, bool IsSigned) -> bool {
     // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
@@ -10755,21 +10767,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
       return false;
 
-    MachineInstr *KillsSCC = nullptr;
-    for (MachineInstr &MI :
-         make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
-      if (MI.modifiesRegister(AMDGPU::SCC, &RI))
-        return false;
-      if (MI.killsRegister(AMDGPU::SCC, &RI))
-        KillsSCC = &MI;
-    }
-
-    MachineOperand *SccDef =
-        Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
-    SccDef->setIsDead(false);
-    if (KillsSCC)
-      KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
-    CmpInstr.eraseFromParent();
+    if (!optimizeSCC(Def, &CmpInstr))
+      return false;
 
     if (!MRI->use_nodbg_empty(DefReg)) {
       assert(!IsReversedCC);

From 559ef0a6afa37b54c7127d85baa816144475f48a Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 22 Oct 2025 12:59:54 -0500
Subject: [PATCH 2/2] Delete redundant s_or_b32

Signed-off-by: John Lu <John.Lu at amd.com>
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  21 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   2 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      | 941 +++++++++---------
 .../test/CodeGen/AMDGPU/carryout-selection.ll |   4 -
 .../expand-scalar-carry-out-select-user.ll    |  10 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            | 368 ++++---
 llvm/test/CodeGen/AMDGPU/srem64.ll            | 410 ++++----
 llvm/test/CodeGen/AMDGPU/uaddo.ll             |   6 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            | 199 ++--
 llvm/test/CodeGen/AMDGPU/urem64.ll            | 296 +++---
 llvm/test/CodeGen/AMDGPU/usubo.ll             |   6 +-
 11 files changed, 1058 insertions(+), 1205 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9dd3bedf38bd7..2172e733ef732 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10160,7 +10160,7 @@ static bool followSubRegDef(MachineInstr &MI,
 }
 
 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                                     MachineRegisterInfo &MRI) {
+                                     const MachineRegisterInfo &MRI) {
   assert(MRI.isSSA());
   if (!P.Reg.isVirtual())
     return nullptr;
@@ -10690,6 +10690,25 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
     if (!optimizeSCC(Def, &CmpInstr))
       return false;
 
+    // If the s_or_b32 result is unused (i.e. it is effectively a 64-bit
+    // s_cmp_lg of a register pair) and the input is a 64-bit foldableSelect,
+    // then transform:
+    //
+    //   (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0)
+    //     => (S_CSELECT_B64 (non-zero imm), 0)
+    if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+        MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+      MachineOperand OrOpnd1 = Def->getOperand(1);
+      MachineOperand OrOpnd2 = Def->getOperand(2);
+
+      if (OrOpnd1.isReg() && OrOpnd2.isReg() &&
+          OrOpnd1.getReg() != OrOpnd2.getReg()) {
+        auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI);
+        auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI);
+        if (Def1 == Def2 && foldableSelect(Def1))
+          optimizeSCC(Def1, Def);
+      }
+    }
     return true;
   };
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 5fdeddaf3f736..f7caae9e257bc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
 /// skipping copy like instructions and subreg-manipulation pseudos.
 /// Following another subreg of a reg:subreg isn't supported.
 MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
-                               MachineRegisterInfo &MRI);
+                               const MachineRegisterInfo &MRI);
 
 /// \brief Return false if EXEC is not changed between the def of \p VReg at \p
 /// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 51df8c34cc55e..54b1554ae5d04 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7772,7 +7772,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
 ; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
@@ -7782,8 +7781,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX6-NEXT:    s_sub_u32 s12, 0, s10
-; GFX6-NEXT:    s_subb_u32 s13, 0, s11
+; GFX6-NEXT:    s_sub_u32 s0, 0, s10
+; GFX6-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -7792,128 +7791,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_mul_i32 s15, s13, s0
-; GFX6-NEXT:    s_mul_i32 s16, s12, s0
-; GFX6-NEXT:    s_add_i32 s1, s17, s1
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT:    s_add_i32 s1, s1, s15
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
-; GFX6-NEXT:    s_mul_i32 s17, s0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT:    s_add_u32 s15, s15, s17
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v0
-; GFX6-NEXT:    s_addc_u32 s17, 0, s17
-; GFX6-NEXT:    s_mul_i32 s16, s14, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s18, v4
-; GFX6-NEXT:    s_add_u32 s15, s15, s16
-; GFX6-NEXT:    s_addc_u32 s15, s17, s18
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
-; GFX6-NEXT:    s_addc_u32 s16, s16, 0
-; GFX6-NEXT:    s_mul_i32 s1, s14, s1
-; GFX6-NEXT:    s_add_u32 s1, s15, s1
-; GFX6-NEXT:    s_addc_u32 s15, 0, s16
-; GFX6-NEXT:    s_add_u32 s16, s0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s14, s14, s15
-; GFX6-NEXT:    s_mul_i32 s0, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX6-NEXT:    s_add_i32 s0, s1, s0
-; GFX6-NEXT:    s_mul_i32 s13, s13, s16
-; GFX6-NEXT:    s_mul_i32 s1, s12, s16
-; GFX6-NEXT:    s_add_i32 s0, s0, s13
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s0
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_add_u32 s13, s17, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
-; GFX6-NEXT:    s_mul_i32 s1, s14, s1
-; GFX6-NEXT:    s_addc_u32 s15, 0, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
-; GFX6-NEXT:    s_add_u32 s1, s13, s1
-; GFX6-NEXT:    s_addc_u32 s1, s15, s12
+; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    s_addc_u32 s12, s12, 0
-; GFX6-NEXT:    s_mul_i32 s0, s14, s0
-; GFX6-NEXT:    s_add_u32 s0, s1, s0
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    s_add_u32 s15, s16, s0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s14, s14, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s13, s0, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v2
+; GFX6-NEXT:    s_mul_i32 s14, s1, s2
+; GFX6-NEXT:    s_mul_i32 s15, s0, s2
+; GFX6-NEXT:    s_add_i32 s13, s16, s13
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s15
+; GFX6-NEXT:    s_add_i32 s13, s13, s14
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
+; GFX6-NEXT:    s_mul_i32 s16, s2, s13
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT:    s_add_u32 s14, s14, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX6-NEXT:    s_mul_i32 s15, s12, s15
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v4
+; GFX6-NEXT:    s_add_u32 s14, s14, s15
+; GFX6-NEXT:    s_addc_u32 s14, s16, s17
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v1
+; GFX6-NEXT:    s_addc_u32 s15, s15, 0
+; GFX6-NEXT:    s_mul_i32 s13, s12, s13
+; GFX6-NEXT:    s_add_u32 s13, s14, s13
+; GFX6-NEXT:    s_addc_u32 s14, 0, s15
+; GFX6-NEXT:    s_add_u32 s13, s2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    s_addc_u32 s12, s12, s14
+; GFX6-NEXT:    s_mul_i32 s14, s0, s12
+; GFX6-NEXT:    s_mul_i32 s1, s1, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX6-NEXT:    s_add_i32 s14, s15, s14
+; GFX6-NEXT:    s_mul_i32 s0, s0, s13
+; GFX6-NEXT:    s_add_i32 s1, s14, s1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT:    s_mul_i32 s15, s13, s1
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_add_u32 s15, s17, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX6-NEXT:    s_mul_i32 s0, s12, s0
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
+; GFX6-NEXT:    s_add_u32 s0, s15, s0
+; GFX6-NEXT:    s_addc_u32 s0, s16, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    s_addc_u32 s14, s14, 0
+; GFX6-NEXT:    s_mul_i32 s1, s12, s1
+; GFX6-NEXT:    s_add_u32 s0, s0, s1
+; GFX6-NEXT:    s_addc_u32 s1, 0, s14
+; GFX6-NEXT:    s_add_u32 s14, s13, s0
+; GFX6-NEXT:    s_addc_u32 s15, s12, s1
 ; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX6-NEXT:    s_add_u32 s0, s6, s12
 ; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    s_addc_u32 s1, s7, s12
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s15
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s15
+; GFX6-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v2
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT:    s_mul_i32 s1, s6, s14
+; GFX6-NEXT:    s_mul_i32 s1, s6, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX6-NEXT:    s_add_u32 s1, s16, s1
 ; GFX6-NEXT:    s_addc_u32 s4, 0, s4
-; GFX6-NEXT:    s_mul_i32 s15, s7, s15
+; GFX6-NEXT:    s_mul_i32 s14, s7, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
-; GFX6-NEXT:    s_add_u32 s1, s1, s15
+; GFX6-NEXT:    s_add_u32 s1, s1, s14
 ; GFX6-NEXT:    s_addc_u32 s1, s4, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX6-NEXT:    s_addc_u32 s4, s4, 0
-; GFX6-NEXT:    s_mul_i32 s14, s7, s14
-; GFX6-NEXT:    s_add_u32 s16, s1, s14
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    s_mul_i32 s14, s7, s15
+; GFX6-NEXT:    s_add_u32 s14, s1, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT:    s_addc_u32 s17, 0, s4
+; GFX6-NEXT:    s_addc_u32 s15, 0, s4
 ; GFX6-NEXT:    s_mov_b32 s1, s5
-; GFX6-NEXT:    s_mul_i32 s4, s10, s17
+; GFX6-NEXT:    s_mul_i32 s4, s10, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
 ; GFX6-NEXT:    s_add_i32 s4, s5, s4
-; GFX6-NEXT:    s_mul_i32 s5, s11, s16
-; GFX6-NEXT:    s_add_i32 s18, s4, s5
-; GFX6-NEXT:    s_sub_i32 s14, s7, s18
-; GFX6-NEXT:    s_mul_i32 s4, s10, s16
+; GFX6-NEXT:    s_mul_i32 s5, s11, s14
+; GFX6-NEXT:    s_add_i32 s16, s4, s5
+; GFX6-NEXT:    s_sub_i32 s17, s7, s16
+; GFX6-NEXT:    s_mul_i32 s4, s10, s14
 ; GFX6-NEXT:    s_sub_u32 s6, s6, s4
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT:    s_or_b32 s15, s4, s5
-; GFX6-NEXT:    s_subb_u32 s19, s14, s11
-; GFX6-NEXT:    s_sub_u32 s20, s6, s10
-; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s14, s19, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s14, s11
-; GFX6-NEXT:    s_cselect_b32 s15, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s20, s10
+; GFX6-NEXT:    s_subb_u32 s17, s17, s11
+; GFX6-NEXT:    s_sub_u32 s18, s6, s10
+; GFX6-NEXT:    s_subb_u32 s17, s17, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s17, s11
 ; GFX6-NEXT:    s_cselect_b32 s19, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s14, s11
-; GFX6-NEXT:    s_cselect_b32 s14, s19, s15
-; GFX6-NEXT:    s_add_u32 s15, s16, 1
-; GFX6-NEXT:    s_addc_u32 s19, s17, 0
-; GFX6-NEXT:    s_add_u32 s20, s16, 2
-; GFX6-NEXT:    s_addc_u32 s21, s17, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX6-NEXT:    s_cselect_b32 s14, s20, s15
-; GFX6-NEXT:    s_cselect_b32 s15, s21, s19
+; GFX6-NEXT:    s_cmp_ge_u32 s18, s10
+; GFX6-NEXT:    s_cselect_b32 s18, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s17, s11
+; GFX6-NEXT:    s_cselect_b32 s17, s18, s19
+; GFX6-NEXT:    s_add_u32 s18, s14, 1
+; GFX6-NEXT:    s_addc_u32 s19, s15, 0
+; GFX6-NEXT:    s_add_u32 s20, s14, 2
+; GFX6-NEXT:    s_addc_u32 s21, s15, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
+; GFX6-NEXT:    s_cselect_b32 s17, s20, s18
+; GFX6-NEXT:    s_cselect_b32 s18, s21, s19
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
-; GFX6-NEXT:    s_subb_u32 s4, s7, s18
+; GFX6-NEXT:    s_subb_u32 s4, s7, s16
 ; GFX6-NEXT:    s_cmp_ge_u32 s4, s11
 ; GFX6-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s6, s10
@@ -7921,13 +7913,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_cmp_eq_u32 s4, s11
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, s5
 ; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX6-NEXT:    s_cselect_b32 s5, s15, s17
-; GFX6-NEXT:    s_cselect_b32 s4, s14, s16
+; GFX6-NEXT:    s_cselect_b32 s5, s18, s15
+; GFX6-NEXT:    s_cselect_b32 s4, s17, s14
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[12:13], s[8:9]
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s6
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s7
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -8278,8 +8271,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_sub_u32 s14, 0, s6
-; GFX6-NEXT:    s_subb_u32 s15, 0, s7
+; GFX6-NEXT:    s_sub_u32 s12, 0, s6
+; GFX6-NEXT:    s_subb_u32 s13, 0, s7
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8288,69 +8281,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
-; GFX6-NEXT:    s_mul_i32 s13, s14, s16
+; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX6-NEXT:    s_mul_i32 s16, s12, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX6-NEXT:    s_mul_i32 s17, s15, s12
-; GFX6-NEXT:    s_mul_i32 s18, s14, s12
-; GFX6-NEXT:    s_add_i32 s13, s19, s13
+; GFX6-NEXT:    s_mul_i32 s17, s13, s15
+; GFX6-NEXT:    s_mul_i32 s18, s12, s15
+; GFX6-NEXT:    s_add_i32 s16, s19, s16
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s18
-; GFX6-NEXT:    s_add_i32 s13, s13, s17
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT:    s_add_i32 s16, s16, s17
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s16
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s18
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v3
-; GFX6-NEXT:    s_mul_i32 s20, s12, s13
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT:    s_mul_i32 s20, s15, s16
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s16
 ; GFX6-NEXT:    s_add_u32 s17, s17, s20
 ; GFX6-NEXT:    v_readfirstlane_b32 s20, v0
-; GFX6-NEXT:    s_mul_i32 s18, s16, s18
+; GFX6-NEXT:    s_mul_i32 s18, s14, s18
 ; GFX6-NEXT:    s_addc_u32 s20, 0, s20
 ; GFX6-NEXT:    v_readfirstlane_b32 s19, v4
 ; GFX6-NEXT:    s_add_u32 s17, s17, s18
 ; GFX6-NEXT:    s_addc_u32 s17, s20, s19
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v1
 ; GFX6-NEXT:    s_addc_u32 s18, s18, 0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s13
-; GFX6-NEXT:    s_add_u32 s13, s17, s13
+; GFX6-NEXT:    s_mul_i32 s16, s14, s16
+; GFX6-NEXT:    s_add_u32 s16, s17, s16
 ; GFX6-NEXT:    s_addc_u32 s17, 0, s18
-; GFX6-NEXT:    s_add_u32 s18, s12, s13
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
-; GFX6-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_addc_u32 s16, s16, s17
-; GFX6-NEXT:    s_mul_i32 s12, s14, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX6-NEXT:    s_add_i32 s12, s13, s12
-; GFX6-NEXT:    s_mul_i32 s15, s15, s18
-; GFX6-NEXT:    s_mul_i32 s13, s14, s18
-; GFX6-NEXT:    s_add_i32 s12, s12, s15
-; GFX6-NEXT:    v_mov_b32_e32 v2, s13
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
-; GFX6-NEXT:    v_mul_hi_u32 v3, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s18, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s16, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s18, v0
-; GFX6-NEXT:    s_mul_i32 s15, s18, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX6-NEXT:    s_add_u32 s15, s19, s15
+; GFX6-NEXT:    s_add_u32 s15, s15, s16
+; GFX6-NEXT:    v_mov_b32_e32 v0, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_addc_u32 s14, s14, s17
+; GFX6-NEXT:    s_mul_i32 s16, s12, s14
+; GFX6-NEXT:    s_mul_i32 s13, s13, s15
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s13
-; GFX6-NEXT:    s_addc_u32 s17, 0, s17
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
-; GFX6-NEXT:    s_add_u32 s13, s15, s13
-; GFX6-NEXT:    s_addc_u32 s13, s17, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    s_addc_u32 s14, s14, 0
-; GFX6-NEXT:    s_mul_i32 s12, s16, s12
-; GFX6-NEXT:    s_add_u32 s12, s13, s12
-; GFX6-NEXT:    s_addc_u32 s14, 0, s14
-; GFX6-NEXT:    s_add_u32 s15, s18, s12
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_addc_u32 s14, s16, s14
+; GFX6-NEXT:    s_add_i32 s16, s17, s16
+; GFX6-NEXT:    s_mul_i32 s12, s12, s15
+; GFX6-NEXT:    s_add_i32 s13, s16, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s15, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s14, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX6-NEXT:    s_mul_i32 s17, s15, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX6-NEXT:    s_add_u32 s17, s19, s17
+; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
+; GFX6-NEXT:    s_mul_i32 s12, s14, s12
+; GFX6-NEXT:    s_addc_u32 s18, 0, s18
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v3
+; GFX6-NEXT:    s_add_u32 s12, s17, s12
+; GFX6-NEXT:    s_addc_u32 s12, s18, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    s_addc_u32 s16, s16, 0
+; GFX6-NEXT:    s_mul_i32 s13, s14, s13
+; GFX6-NEXT:    s_add_u32 s12, s12, s13
+; GFX6-NEXT:    s_addc_u32 s13, 0, s16
+; GFX6-NEXT:    s_add_u32 s15, s15, s12
+; GFX6-NEXT:    s_addc_u32 s14, s14, s13
 ; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX6-NEXT:    s_add_u32 s8, s8, s12
 ; GFX6-NEXT:    s_mov_b32 s13, s12
@@ -8374,40 +8363,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
 ; GFX6-NEXT:    s_addc_u32 s16, s16, 0
 ; GFX6-NEXT:    s_mul_i32 s14, s9, s14
-; GFX6-NEXT:    s_add_u32 s18, s15, s14
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    s_add_u32 s17, s15, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s17
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT:    s_addc_u32 s19, 0, s16
-; GFX6-NEXT:    s_mul_i32 s14, s6, s19
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    s_mul_i32 s14, s6, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
 ; GFX6-NEXT:    s_add_i32 s14, s15, s14
-; GFX6-NEXT:    s_mul_i32 s15, s7, s18
-; GFX6-NEXT:    s_add_i32 s20, s14, s15
-; GFX6-NEXT:    s_sub_i32 s16, s9, s20
-; GFX6-NEXT:    s_mul_i32 s14, s6, s18
+; GFX6-NEXT:    s_mul_i32 s15, s7, s17
+; GFX6-NEXT:    s_add_i32 s18, s14, s15
+; GFX6-NEXT:    s_sub_i32 s19, s9, s18
+; GFX6-NEXT:    s_mul_i32 s14, s6, s17
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s14
 ; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s17, s14, s15
-; GFX6-NEXT:    s_subb_u32 s21, s16, s7
-; GFX6-NEXT:    s_sub_u32 s22, s8, s6
-; GFX6-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GFX6-NEXT:    s_or_b32 s16, s16, s17
-; GFX6-NEXT:    s_subb_u32 s16, s21, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s16, s7
-; GFX6-NEXT:    s_cselect_b32 s17, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s22, s6
+; GFX6-NEXT:    s_subb_u32 s19, s19, s7
+; GFX6-NEXT:    s_sub_u32 s20, s8, s6
+; GFX6-NEXT:    s_subb_u32 s19, s19, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s19, s7
 ; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s16, s7
-; GFX6-NEXT:    s_cselect_b32 s16, s21, s17
-; GFX6-NEXT:    s_add_u32 s17, s18, 1
-; GFX6-NEXT:    s_addc_u32 s21, s19, 0
-; GFX6-NEXT:    s_add_u32 s22, s18, 2
-; GFX6-NEXT:    s_addc_u32 s23, s19, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
-; GFX6-NEXT:    s_cselect_b32 s16, s22, s17
-; GFX6-NEXT:    s_cselect_b32 s17, s23, s21
+; GFX6-NEXT:    s_cmp_ge_u32 s20, s6
+; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s19, s7
+; GFX6-NEXT:    s_cselect_b32 s19, s20, s21
+; GFX6-NEXT:    s_add_u32 s20, s17, 1
+; GFX6-NEXT:    s_addc_u32 s21, s16, 0
+; GFX6-NEXT:    s_add_u32 s22, s17, 2
+; GFX6-NEXT:    s_addc_u32 s23, s16, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b32 s19, s22, s20
+; GFX6-NEXT:    s_cselect_b32 s20, s23, s21
 ; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s9, s9, s20
+; GFX6-NEXT:    s_subb_u32 s9, s9, s18
 ; GFX6-NEXT:    s_cmp_ge_u32 s9, s7
 ; GFX6-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s8, s6
@@ -8415,12 +8401,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s9, s7
 ; GFX6-NEXT:    s_cselect_b32 s6, s6, s14
 ; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX6-NEXT:    s_cselect_b32 s7, s17, s19
-; GFX6-NEXT:    s_cselect_b32 s6, s16, s18
+; GFX6-NEXT:    s_cselect_b32 s7, s20, s16
+; GFX6-NEXT:    s_cselect_b32 s6, s19, s17
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[12:13], s[2:3]
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT:    s_sub_u32 s16, s6, s2
-; GFX6-NEXT:    s_subb_u32 s17, s7, s3
+; GFX6-NEXT:    s_sub_u32 s14, s6, s2
+; GFX6-NEXT:    s_subb_u32 s15, s7, s3
 ; GFX6-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX6-NEXT:    s_add_u32 s0, s0, s6
 ; GFX6-NEXT:    s_mov_b32 s7, s6
@@ -8428,8 +8414,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_u32 s12, 0, s8
-; GFX6-NEXT:    s_subb_u32 s13, 0, s9
+; GFX6-NEXT:    s_sub_u32 s2, 0, s8
+; GFX6-NEXT:    s_subb_u32 s3, 0, s9
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8438,128 +8424,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX6-NEXT:    s_mul_i32 s0, s13, s2
-; GFX6-NEXT:    s_add_i32 s1, s3, s1
-; GFX6-NEXT:    s_add_i32 s3, s1, s0
-; GFX6-NEXT:    s_mul_i32 s15, s12, s2
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s15
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mul_i32 s4, s2, s3
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s13, s2, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v2
+; GFX6-NEXT:    s_mul_i32 s1, s3, s0
+; GFX6-NEXT:    s_add_i32 s13, s16, s13
+; GFX6-NEXT:    s_add_i32 s13, s13, s1
+; GFX6-NEXT:    s_mul_i32 s1, s2, s0
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT:    s_mul_i32 s16, s0, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s15
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT:    s_add_u32 s4, s18, s4
-; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    s_mul_i32 s15, s14, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT:    s_add_u32 s16, s18, s16
+; GFX6-NEXT:    s_addc_u32 s17, 0, s17
+; GFX6-NEXT:    s_mul_i32 s1, s12, s1
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    s_add_u32 s4, s4, s15
-; GFX6-NEXT:    s_addc_u32 s4, s5, s18
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
-; GFX6-NEXT:    s_mul_i32 s3, s14, s3
-; GFX6-NEXT:    s_add_u32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s4, 0, s5
-; GFX6-NEXT:    s_add_u32 s5, s2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s5
-; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s4, s14, s4
-; GFX6-NEXT:    s_mul_i32 s2, s12, s4
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX6-NEXT:    s_add_i32 s2, s3, s2
-; GFX6-NEXT:    s_mul_i32 s13, s13, s5
-; GFX6-NEXT:    s_mul_i32 s3, s12, s5
-; GFX6-NEXT:    s_add_i32 s2, s2, s13
-; GFX6-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_add_u32 s1, s16, s1
+; GFX6-NEXT:    s_addc_u32 s1, s17, s18
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
+; GFX6-NEXT:    s_addc_u32 s16, s16, 0
+; GFX6-NEXT:    s_mul_i32 s13, s12, s13
+; GFX6-NEXT:    s_add_u32 s1, s1, s13
+; GFX6-NEXT:    s_addc_u32 s13, 0, s16
+; GFX6-NEXT:    s_add_u32 s16, s0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_addc_u32 s4, s12, s13
+; GFX6-NEXT:    s_mul_i32 s5, s2, s4
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX6-NEXT:    s_add_i32 s5, s12, s5
+; GFX6-NEXT:    s_mul_i32 s3, s3, s16
+; GFX6-NEXT:    s_mul_i32 s2, s2, s16
+; GFX6-NEXT:    s_add_i32 s3, s5, s3
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    s_mul_i32 s13, s5, s2
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
-; GFX6-NEXT:    s_add_u32 s13, s15, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
-; GFX6-NEXT:    s_mul_i32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s14, 0, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
-; GFX6-NEXT:    s_add_u32 s3, s13, s3
-; GFX6-NEXT:    s_addc_u32 s3, s14, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s16, v0
+; GFX6-NEXT:    s_mul_i32 s12, s16, s3
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_add_u32 s12, s17, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
 ; GFX6-NEXT:    s_mul_i32 s2, s4, s2
-; GFX6-NEXT:    s_add_u32 s2, s3, s2
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    s_add_u32 s13, s5, s2
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s12, s4, s12
+; GFX6-NEXT:    s_addc_u32 s13, 0, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX6-NEXT:    s_add_u32 s2, s12, s2
+; GFX6-NEXT:    s_addc_u32 s2, s13, s5
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX6-NEXT:    s_addc_u32 s5, s5, 0
+; GFX6-NEXT:    s_mul_i32 s3, s4, s3
+; GFX6-NEXT:    s_add_u32 s2, s2, s3
+; GFX6-NEXT:    s_addc_u32 s3, 0, s5
+; GFX6-NEXT:    s_add_u32 s12, s16, s2
+; GFX6-NEXT:    s_addc_u32 s13, s4, s3
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_add_u32 s2, s10, s4
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_addc_u32 s3, s11, s4
 ; GFX6-NEXT:    s_xor_b64 s[10:11], s[2:3], s[4:5]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s10, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
-; GFX6-NEXT:    s_mul_i32 s2, s10, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    s_mul_i32 s2, s10, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT:    s_add_u32 s2, s15, s2
-; GFX6-NEXT:    s_addc_u32 s14, 0, s14
-; GFX6-NEXT:    s_mul_i32 s13, s11, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v1
-; GFX6-NEXT:    s_add_u32 s2, s2, s13
-; GFX6-NEXT:    s_addc_u32 s2, s14, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_add_u32 s2, s17, s2
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
 ; GFX6-NEXT:    s_mul_i32 s12, s11, s12
-; GFX6-NEXT:    s_add_u32 s18, s2, s12
-; GFX6-NEXT:    v_mov_b32_e32 v0, s18
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v1
+; GFX6-NEXT:    s_add_u32 s2, s2, s12
+; GFX6-NEXT:    s_addc_u32 s2, s16, s17
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    s_mul_i32 s13, s11, s13
+; GFX6-NEXT:    s_add_u32 s16, s2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    s_addc_u32 s19, 0, s13
-; GFX6-NEXT:    s_mul_i32 s12, s8, s19
+; GFX6-NEXT:    s_addc_u32 s17, 0, s12
+; GFX6-NEXT:    s_mul_i32 s12, s8, s17
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
 ; GFX6-NEXT:    s_add_i32 s12, s13, s12
-; GFX6-NEXT:    s_mul_i32 s13, s9, s18
-; GFX6-NEXT:    s_add_i32 s20, s12, s13
-; GFX6-NEXT:    s_sub_i32 s14, s11, s20
-; GFX6-NEXT:    s_mul_i32 s12, s8, s18
+; GFX6-NEXT:    s_mul_i32 s13, s9, s16
+; GFX6-NEXT:    s_add_i32 s18, s12, s13
+; GFX6-NEXT:    s_sub_i32 s19, s11, s18
+; GFX6-NEXT:    s_mul_i32 s12, s8, s16
 ; GFX6-NEXT:    s_sub_u32 s10, s10, s12
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s15, s12, s13
-; GFX6-NEXT:    s_subb_u32 s21, s14, s9
-; GFX6-NEXT:    s_sub_u32 s22, s10, s8
-; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s14, s21, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s14, s9
-; GFX6-NEXT:    s_cselect_b32 s15, -1, 0
-; GFX6-NEXT:    s_cmp_ge_u32 s22, s8
+; GFX6-NEXT:    s_subb_u32 s19, s19, s9
+; GFX6-NEXT:    s_sub_u32 s20, s10, s8
+; GFX6-NEXT:    s_subb_u32 s19, s19, 0
+; GFX6-NEXT:    s_cmp_ge_u32 s19, s9
 ; GFX6-NEXT:    s_cselect_b32 s21, -1, 0
-; GFX6-NEXT:    s_cmp_eq_u32 s14, s9
-; GFX6-NEXT:    s_cselect_b32 s14, s21, s15
-; GFX6-NEXT:    s_add_u32 s15, s18, 1
-; GFX6-NEXT:    s_addc_u32 s21, s19, 0
-; GFX6-NEXT:    s_add_u32 s22, s18, 2
-; GFX6-NEXT:    s_addc_u32 s23, s19, 0
-; GFX6-NEXT:    s_cmp_lg_u32 s14, 0
-; GFX6-NEXT:    s_cselect_b32 s14, s22, s15
-; GFX6-NEXT:    s_cselect_b32 s15, s23, s21
+; GFX6-NEXT:    s_cmp_ge_u32 s20, s8
+; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX6-NEXT:    s_cmp_eq_u32 s19, s9
+; GFX6-NEXT:    s_cselect_b32 s19, s20, s21
+; GFX6-NEXT:    s_add_u32 s20, s16, 1
+; GFX6-NEXT:    s_addc_u32 s21, s17, 0
+; GFX6-NEXT:    s_add_u32 s22, s16, 2
+; GFX6-NEXT:    s_addc_u32 s23, s17, 0
+; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
+; GFX6-NEXT:    s_cselect_b32 s19, s22, s20
+; GFX6-NEXT:    s_cselect_b32 s20, s23, s21
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s11, s11, s20
+; GFX6-NEXT:    s_subb_u32 s11, s11, s18
 ; GFX6-NEXT:    s_cmp_ge_u32 s11, s9
 ; GFX6-NEXT:    s_cselect_b32 s12, -1, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s10, s8
@@ -8567,15 +8546,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s11, s9
 ; GFX6-NEXT:    s_cselect_b32 s8, s8, s12
 ; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX6-NEXT:    s_cselect_b32 s9, s15, s19
-; GFX6-NEXT:    s_cselect_b32 s8, s14, s18
+; GFX6-NEXT:    s_cselect_b32 s9, s20, s17
+; GFX6-NEXT:    s_cselect_b32 s8, s19, s16
 ; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[8:9], s[4:5]
 ; GFX6-NEXT:    s_sub_u32 s4, s6, s4
 ; GFX6-NEXT:    s_subb_u32 s5, s7, s5
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    v_mov_b32_e32 v1, s17
+; GFX6-NEXT:    v_mov_b32_e32 v0, s14
+; GFX6-NEXT:    v_mov_b32_e32 v1, s15
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
@@ -9015,105 +8994,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    s_sub_u32 s10, 0, s8
-; GFX6-NEXT:    s_subb_u32 s11, 0, s9
+; GFX6-NEXT:    s_sub_u32 s0, 0, s8
+; GFX6-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX6-NEXT:    s_mul_i32 s1, s10, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
-; GFX6-NEXT:    s_mul_i32 s13, s11, s0
-; GFX6-NEXT:    s_mul_i32 s14, s10, s0
-; GFX6-NEXT:    s_add_i32 s1, s15, s1
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GFX6-NEXT:    s_add_i32 s1, s1, s13
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v3
-; GFX6-NEXT:    s_mul_i32 s15, s0, s1
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT:    s_add_u32 s13, s13, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
-; GFX6-NEXT:    s_addc_u32 s15, 0, s15
-; GFX6-NEXT:    s_mul_i32 s14, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s16, v4
-; GFX6-NEXT:    s_add_u32 s13, s13, s14
-; GFX6-NEXT:    s_addc_u32 s13, s15, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    s_addc_u32 s14, s14, 0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s1
-; GFX6-NEXT:    s_add_u32 s1, s13, s1
-; GFX6-NEXT:    s_addc_u32 s13, 0, s14
-; GFX6-NEXT:    s_add_u32 s14, s0, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s14
-; GFX6-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s12, s12, s13
-; GFX6-NEXT:    s_mul_i32 s0, s10, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s1, v0
-; GFX6-NEXT:    s_add_i32 s0, s1, s0
-; GFX6-NEXT:    s_mul_i32 s11, s11, s14
-; GFX6-NEXT:    s_mul_i32 s1, s10, s14
-; GFX6-NEXT:    s_add_i32 s0, s0, s11
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT:    s_mul_i32 s11, s14, s0
-; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
-; GFX6-NEXT:    s_add_u32 s11, s15, s11
-; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX6-NEXT:    s_mul_i32 s1, s12, s1
-; GFX6-NEXT:    s_addc_u32 s13, 0, s13
-; GFX6-NEXT:    v_readfirstlane_b32 s10, v3
-; GFX6-NEXT:    s_add_u32 s1, s11, s1
-; GFX6-NEXT:    s_addc_u32 s1, s13, s10
+; GFX6-NEXT:    v_mul_hi_u32 v2, s0, v0
 ; GFX6-NEXT:    v_readfirstlane_b32 s10, v1
-; GFX6-NEXT:    s_addc_u32 s10, s10, 0
-; GFX6-NEXT:    s_mul_i32 s0, s12, s0
-; GFX6-NEXT:    s_add_u32 s0, s1, s0
-; GFX6-NEXT:    s_addc_u32 s10, 0, s10
-; GFX6-NEXT:    s_add_u32 s13, s14, s0
-; GFX6-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_addc_u32 s12, s12, s10
+; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX6-NEXT:    s_mul_i32 s11, s0, s10
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v2
+; GFX6-NEXT:    s_mul_i32 s12, s1, s2
+; GFX6-NEXT:    s_mul_i32 s13, s0, s2
+; GFX6-NEXT:    s_add_i32 s11, s14, s11
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GFX6-NEXT:    s_add_i32 s11, s11, s12
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
+; GFX6-NEXT:    s_mul_i32 s14, s2, s11
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GFX6-NEXT:    s_add_u32 s12, s12, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
+; GFX6-NEXT:    s_mul_i32 s13, s10, s13
+; GFX6-NEXT:    s_addc_u32 s14, 0, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v4
+; GFX6-NEXT:    s_add_u32 s12, s12, s13
+; GFX6-NEXT:    s_addc_u32 s12, s14, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX6-NEXT:    s_addc_u32 s13, s13, 0
+; GFX6-NEXT:    s_mul_i32 s11, s10, s11
+; GFX6-NEXT:    s_add_u32 s11, s12, s11
+; GFX6-NEXT:    s_addc_u32 s12, 0, s13
+; GFX6-NEXT:    s_add_u32 s11, s2, s11
+; GFX6-NEXT:    v_mov_b32_e32 v0, s11
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    s_addc_u32 s10, s10, s12
+; GFX6-NEXT:    s_mul_i32 s12, s0, s10
+; GFX6-NEXT:    s_mul_i32 s1, s1, s11
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX6-NEXT:    s_add_i32 s12, s13, s12
+; GFX6-NEXT:    s_mul_i32 s0, s0, s11
+; GFX6-NEXT:    s_add_i32 s1, s12, s1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
+; GFX6-NEXT:    v_mov_b32_e32 v0, s1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GFX6-NEXT:    s_mul_i32 s13, s11, s1
+; GFX6-NEXT:    v_readfirstlane_b32 s15, v2
+; GFX6-NEXT:    s_add_u32 s13, s15, s13
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v0
+; GFX6-NEXT:    s_mul_i32 s0, s10, s0
+; GFX6-NEXT:    s_addc_u32 s14, 0, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
+; GFX6-NEXT:    s_add_u32 s0, s13, s0
+; GFX6-NEXT:    s_addc_u32 s0, s14, s12
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    s_mul_i32 s1, s10, s1
+; GFX6-NEXT:    s_add_u32 s0, s0, s1
+; GFX6-NEXT:    s_addc_u32 s1, 0, s12
+; GFX6-NEXT:    s_add_u32 s12, s11, s0
+; GFX6-NEXT:    s_addc_u32 s13, s10, s1
 ; GFX6-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX6-NEXT:    s_add_u32 s0, s6, s10
 ; GFX6-NEXT:    s_mov_b32 s11, s10
 ; GFX6-NEXT:    s_addc_u32 s1, s7, s10
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s13
+; GFX6-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v2
 ; GFX6-NEXT:    s_mov_b32 s0, s4
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT:    s_mul_i32 s1, s6, s12
+; GFX6-NEXT:    s_mul_i32 s1, s6, s13
 ; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX6-NEXT:    s_add_u32 s1, s14, s1
 ; GFX6-NEXT:    s_addc_u32 s4, 0, s4
-; GFX6-NEXT:    s_mul_i32 s13, s7, s13
+; GFX6-NEXT:    s_mul_i32 s12, s7, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    s_add_u32 s1, s1, s13
+; GFX6-NEXT:    s_add_u32 s1, s1, s12
 ; GFX6-NEXT:    s_addc_u32 s1, s4, s14
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX6-NEXT:    s_addc_u32 s4, s4, 0
-; GFX6-NEXT:    s_mul_i32 s12, s7, s12
+; GFX6-NEXT:    s_mul_i32 s12, s7, s13
 ; GFX6-NEXT:    s_add_u32 s12, s1, s12
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
@@ -9128,11 +9102,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_mul_i32 s4, s8, s12
 ; GFX6-NEXT:    s_sub_u32 s6, s6, s4
 ; GFX6-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s4, s5
 ; GFX6-NEXT:    s_subb_u32 s15, s13, s9
 ; GFX6-NEXT:    s_sub_u32 s16, s6, s8
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s17, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s17, s15, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s17, s9
 ; GFX6-NEXT:    s_cselect_b32 s18, -1, 0
@@ -9141,13 +9113,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_cmp_eq_u32 s17, s9
 ; GFX6-NEXT:    s_cselect_b32 s18, s19, s18
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s15, s15, s9
-; GFX6-NEXT:    s_sub_u32 s19, s16, s8
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s12, s15, 0
+; GFX6-NEXT:    s_subb_u32 s12, s15, s9
+; GFX6-NEXT:    s_sub_u32 s13, s16, s8
+; GFX6-NEXT:    s_subb_u32 s12, s12, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
-; GFX6-NEXT:    s_cselect_b32 s13, s19, s16
+; GFX6-NEXT:    s_cselect_b32 s13, s13, s16
 ; GFX6-NEXT:    s_cselect_b32 s12, s12, s17
 ; GFX6-NEXT:    s_or_b32 s4, s4, s5
 ; GFX6-NEXT:    s_subb_u32 s4, s7, s14
@@ -9164,6 +9134,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    s_sub_u32 s4, s4, s10
 ; GFX6-NEXT:    s_subb_u32 s5, s5, s10
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -9405,8 +9376,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT:    s_sub_u32 s12, 0, s2
-; GFX6-NEXT:    s_subb_u32 s13, 0, s3
+; GFX6-NEXT:    s_sub_u32 s6, 0, s2
+; GFX6-NEXT:    s_subb_u32 s7, 0, s3
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9415,69 +9386,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX6-NEXT:    s_mul_i32 s7, s12, s14
+; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX6-NEXT:    s_mul_i32 s14, s6, s12
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_mul_i32 s15, s13, s6
-; GFX6-NEXT:    s_mul_i32 s16, s12, s6
-; GFX6-NEXT:    s_add_i32 s7, s17, s7
+; GFX6-NEXT:    s_mul_i32 s15, s7, s13
+; GFX6-NEXT:    s_mul_i32 s16, s6, s13
+; GFX6-NEXT:    s_add_i32 s14, s17, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT:    s_add_i32 s7, s7, s15
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s7
+; GFX6-NEXT:    s_add_i32 s14, s14, s15
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s14
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, s16
 ; GFX6-NEXT:    v_readfirstlane_b32 s15, v3
-; GFX6-NEXT:    s_mul_i32 s18, s6, s7
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s7
+; GFX6-NEXT:    s_mul_i32 s18, s13, s14
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s14
 ; GFX6-NEXT:    s_add_u32 s15, s15, s18
 ; GFX6-NEXT:    v_readfirstlane_b32 s18, v0
-; GFX6-NEXT:    s_mul_i32 s16, s14, s16
+; GFX6-NEXT:    s_mul_i32 s16, s12, s16
 ; GFX6-NEXT:    s_addc_u32 s18, 0, s18
 ; GFX6-NEXT:    v_readfirstlane_b32 s17, v4
 ; GFX6-NEXT:    s_add_u32 s15, s15, s16
 ; GFX6-NEXT:    s_addc_u32 s15, s18, s17
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v1
 ; GFX6-NEXT:    s_addc_u32 s16, s16, 0
-; GFX6-NEXT:    s_mul_i32 s7, s14, s7
-; GFX6-NEXT:    s_add_u32 s7, s15, s7
+; GFX6-NEXT:    s_mul_i32 s14, s12, s14
+; GFX6-NEXT:    s_add_u32 s14, s15, s14
 ; GFX6-NEXT:    s_addc_u32 s15, 0, s16
-; GFX6-NEXT:    s_add_u32 s16, s6, s7
-; GFX6-NEXT:    v_mov_b32_e32 v0, s16
-; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT:    s_or_b32 s6, s6, s7
-; GFX6-NEXT:    s_addc_u32 s14, s14, s15
-; GFX6-NEXT:    s_mul_i32 s6, s12, s14
-; GFX6-NEXT:    v_readfirstlane_b32 s7, v0
-; GFX6-NEXT:    s_add_i32 s6, s7, s6
-; GFX6-NEXT:    s_mul_i32 s13, s13, s16
-; GFX6-NEXT:    s_mul_i32 s7, s12, s16
-; GFX6-NEXT:    s_add_i32 s6, s6, s13
-; GFX6-NEXT:    v_mov_b32_e32 v2, s7
-; GFX6-NEXT:    v_mov_b32_e32 v0, s6
-; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT:    s_mul_i32 s13, s16, s6
-; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
-; GFX6-NEXT:    s_add_u32 s13, s17, s13
+; GFX6-NEXT:    s_add_u32 s13, s13, s14
+; GFX6-NEXT:    v_mov_b32_e32 v0, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    s_addc_u32 s12, s12, s15
+; GFX6-NEXT:    s_mul_i32 s14, s6, s12
+; GFX6-NEXT:    s_mul_i32 s7, s7, s13
 ; GFX6-NEXT:    v_readfirstlane_b32 s15, v0
-; GFX6-NEXT:    s_mul_i32 s7, s14, s7
-; GFX6-NEXT:    s_addc_u32 s15, 0, s15
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v3
-; GFX6-NEXT:    s_add_u32 s7, s13, s7
-; GFX6-NEXT:    s_addc_u32 s7, s15, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    s_addc_u32 s12, s12, 0
-; GFX6-NEXT:    s_mul_i32 s6, s14, s6
-; GFX6-NEXT:    s_add_u32 s6, s7, s6
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    s_add_u32 s13, s16, s6
-; GFX6-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT:    s_or_b32 s6, s6, s7
-; GFX6-NEXT:    s_addc_u32 s12, s14, s12
+; GFX6-NEXT:    s_add_i32 s14, s15, s14
+; GFX6-NEXT:    s_mul_i32 s6, s6, s13
+; GFX6-NEXT:    s_add_i32 s7, s14, s7
+; GFX6-NEXT:    v_mov_b32_e32 v2, s6
+; GFX6-NEXT:    v_mov_b32_e32 v0, s7
+; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT:    s_mul_i32 s15, s13, s7
+; GFX6-NEXT:    v_readfirstlane_b32 s17, v2
+; GFX6-NEXT:    s_add_u32 s15, s17, s15
+; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX6-NEXT:    s_mul_i32 s6, s12, s6
+; GFX6-NEXT:    s_addc_u32 s16, 0, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v3
+; GFX6-NEXT:    s_add_u32 s6, s15, s6
+; GFX6-NEXT:    s_addc_u32 s6, s16, s14
+; GFX6-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX6-NEXT:    s_addc_u32 s14, s14, 0
+; GFX6-NEXT:    s_mul_i32 s7, s12, s7
+; GFX6-NEXT:    s_add_u32 s6, s6, s7
+; GFX6-NEXT:    s_addc_u32 s7, 0, s14
+; GFX6-NEXT:    s_add_u32 s13, s13, s6
+; GFX6-NEXT:    s_addc_u32 s12, s12, s7
 ; GFX6-NEXT:    s_ashr_i32 s6, s9, 31
 ; GFX6-NEXT:    s_add_u32 s8, s8, s6
 ; GFX6-NEXT:    s_mov_b32 s7, s6
@@ -9514,11 +9481,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_mul_i32 s12, s2, s12
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s12
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s15, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s17, s14, s3
 ; GFX6-NEXT:    s_sub_u32 s18, s8, s2
 ; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s19, s14, s15
 ; GFX6-NEXT:    s_subb_u32 s19, s17, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s19, s3
 ; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
@@ -9527,13 +9492,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s19, s3
 ; GFX6-NEXT:    s_cselect_b32 s20, s21, s20
 ; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s17, s17, s3
-; GFX6-NEXT:    s_sub_u32 s21, s18, s2
-; GFX6-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT:    s_or_b32 s14, s14, s15
-; GFX6-NEXT:    s_subb_u32 s14, s17, 0
+; GFX6-NEXT:    s_subb_u32 s14, s17, s3
+; GFX6-NEXT:    s_sub_u32 s15, s18, s2
+; GFX6-NEXT:    s_subb_u32 s14, s14, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b32 s15, s21, s18
+; GFX6-NEXT:    s_cselect_b32 s15, s15, s18
 ; GFX6-NEXT:    s_cselect_b32 s14, s14, s19
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s9, s9, s16
@@ -9556,8 +9519,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    s_sub_u32 s8, 0, s6
-; GFX6-NEXT:    s_subb_u32 s9, 0, s7
+; GFX6-NEXT:    s_sub_u32 s2, 0, s6
+; GFX6-NEXT:    s_subb_u32 s3, 0, s7
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9566,70 +9529,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX6-NEXT:    s_mul_i32 s1, s8, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX6-NEXT:    s_mul_i32 s0, s9, s2
-; GFX6-NEXT:    s_add_i32 s1, s3, s1
-; GFX6-NEXT:    s_add_i32 s3, s1, s0
-; GFX6-NEXT:    s_mul_i32 s13, s8, s2
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s13
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT:    s_mul_i32 s4, s2, s3
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX6-NEXT:    s_mul_i32 s9, s2, s8
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v2
+; GFX6-NEXT:    s_mul_i32 s1, s3, s0
+; GFX6-NEXT:    s_add_i32 s9, s12, s9
+; GFX6-NEXT:    s_add_i32 s9, s9, s1
+; GFX6-NEXT:    s_mul_i32 s1, s2, s0
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s9
+; GFX6-NEXT:    v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT:    s_mul_i32 s12, s0, s9
+; GFX6-NEXT:    v_readfirstlane_b32 s13, v2
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s13
-; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT:    s_add_u32 s4, s16, s4
-; GFX6-NEXT:    s_addc_u32 s5, 0, s5
-; GFX6-NEXT:    s_mul_i32 s13, s12, s13
+; GFX6-NEXT:    v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT:    v_mul_hi_u32 v1, v1, s9
+; GFX6-NEXT:    s_add_u32 s12, s16, s12
+; GFX6-NEXT:    s_addc_u32 s13, 0, s13
+; GFX6-NEXT:    s_mul_i32 s1, s8, s1
 ; GFX6-NEXT:    v_readfirstlane_b32 s16, v0
-; GFX6-NEXT:    s_add_u32 s4, s4, s13
-; GFX6-NEXT:    s_addc_u32 s4, s5, s16
-; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX6-NEXT:    s_addc_u32 s5, s5, 0
-; GFX6-NEXT:    s_mul_i32 s3, s12, s3
-; GFX6-NEXT:    s_add_u32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s4, 0, s5
-; GFX6-NEXT:    s_add_u32 s5, s2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s5
-; GFX6-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s4, s12, s4
-; GFX6-NEXT:    s_mul_i32 s2, s8, s4
-; GFX6-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX6-NEXT:    s_add_i32 s2, s3, s2
-; GFX6-NEXT:    s_mul_i32 s9, s9, s5
-; GFX6-NEXT:    s_mul_i32 s3, s8, s5
-; GFX6-NEXT:    s_add_i32 s2, s2, s9
-; GFX6-NEXT:    v_mov_b32_e32 v2, s3
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
+; GFX6-NEXT:    s_add_u32 s1, s12, s1
+; GFX6-NEXT:    s_addc_u32 s1, s13, s16
+; GFX6-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX6-NEXT:    s_addc_u32 s12, s12, 0
+; GFX6-NEXT:    s_mul_i32 s9, s8, s9
+; GFX6-NEXT:    s_add_u32 s1, s1, s9
+; GFX6-NEXT:    s_addc_u32 s9, 0, s12
+; GFX6-NEXT:    s_add_u32 s12, s0, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s12
+; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT:    s_addc_u32 s4, s8, s9
+; GFX6-NEXT:    s_mul_i32 s5, s2, s4
+; GFX6-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX6-NEXT:    s_add_i32 s5, s8, s5
+; GFX6-NEXT:    s_mul_i32 s3, s3, s12
+; GFX6-NEXT:    s_mul_i32 s2, s2, s12
+; GFX6-NEXT:    s_add_i32 s3, s5, s3
+; GFX6-NEXT:    v_mov_b32_e32 v2, s2
+; GFX6-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    s_mul_i32 s9, s5, s2
+; GFX6-NEXT:    v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT:    s_mul_i32 s8, s12, s3
 ; GFX6-NEXT:    v_readfirstlane_b32 s13, v2
-; GFX6-NEXT:    s_add_u32 s9, s13, s9
-; GFX6-NEXT:    v_readfirstlane_b32 s12, v0
-; GFX6-NEXT:    s_mul_i32 s3, s4, s3
-; GFX6-NEXT:    s_addc_u32 s12, 0, s12
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v3
-; GFX6-NEXT:    s_add_u32 s3, s9, s3
-; GFX6-NEXT:    s_addc_u32 s3, s12, s8
-; GFX6-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX6-NEXT:    s_addc_u32 s8, s8, 0
+; GFX6-NEXT:    s_add_u32 s8, s13, s8
+; GFX6-NEXT:    v_readfirstlane_b32 s9, v0
 ; GFX6-NEXT:    s_mul_i32 s2, s4, s2
-; GFX6-NEXT:    s_add_u32 s2, s3, s2
-; GFX6-NEXT:    s_addc_u32 s8, 0, s8
-; GFX6-NEXT:    s_add_u32 s12, s5, s2
-; GFX6-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_addc_u32 s13, s4, s8
+; GFX6-NEXT:    s_addc_u32 s9, 0, s9
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX6-NEXT:    s_add_u32 s2, s8, s2
+; GFX6-NEXT:    s_addc_u32 s2, s9, s5
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX6-NEXT:    s_addc_u32 s5, s5, 0
+; GFX6-NEXT:    s_mul_i32 s3, s4, s3
+; GFX6-NEXT:    s_add_u32 s2, s2, s3
+; GFX6-NEXT:    s_addc_u32 s3, 0, s5
+; GFX6-NEXT:    s_add_u32 s12, s12, s2
+; GFX6-NEXT:    s_addc_u32 s13, s4, s3
 ; GFX6-NEXT:    s_ashr_i32 s4, s11, 31
 ; GFX6-NEXT:    s_add_u32 s2, s10, s4
 ; GFX6-NEXT:    s_mov_b32 s5, s4
@@ -9667,11 +9626,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_mul_i32 s10, s6, s11
 ; GFX6-NEXT:    s_sub_u32 s8, s8, s10
 ; GFX6-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT:    s_or_b32 s13, s10, s11
 ; GFX6-NEXT:    s_subb_u32 s17, s12, s7
 ; GFX6-NEXT:    s_sub_u32 s18, s8, s6
 ; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s19, s12, s13
 ; GFX6-NEXT:    s_subb_u32 s19, s17, 0
 ; GFX6-NEXT:    s_cmp_ge_u32 s19, s7
 ; GFX6-NEXT:    s_cselect_b32 s20, -1, 0
@@ -9680,13 +9637,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_cmp_eq_u32 s19, s7
 ; GFX6-NEXT:    s_cselect_b32 s20, s21, s20
 ; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s17, s17, s7
-; GFX6-NEXT:    s_sub_u32 s21, s18, s6
-; GFX6-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT:    s_or_b32 s12, s12, s13
-; GFX6-NEXT:    s_subb_u32 s12, s17, 0
+; GFX6-NEXT:    s_subb_u32 s12, s17, s7
+; GFX6-NEXT:    s_sub_u32 s13, s18, s6
+; GFX6-NEXT:    s_subb_u32 s12, s12, 0
 ; GFX6-NEXT:    s_cmp_lg_u32 s20, 0
-; GFX6-NEXT:    s_cselect_b32 s13, s21, s18
+; GFX6-NEXT:    s_cselect_b32 s13, s13, s18
 ; GFX6-NEXT:    s_cselect_b32 s12, s12, s19
 ; GFX6-NEXT:    s_or_b32 s10, s10, s11
 ; GFX6-NEXT:    s_subb_u32 s9, s9, s16
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de173dc8c6..8d05317162e9c 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_mov_b32 s10, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_add_u32 s4, s4, s6
-; CISI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT:    s_or_b32 s6, s12, s13
 ; CISI-NEXT:    s_addc_u32 s5, s5, s7
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
@@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ; CISI-NEXT:    s_mov_b32 s10, -1
 ; CISI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CISI-NEXT:    s_sub_u32 s4, s4, s6
-; CISI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT:    s_or_b32 s6, s12, s13
 ; CISI-NEXT:    s_subb_u32 s5, s5, s7
 ; CISI-NEXT:    s_mov_b32 s8, s0
 ; CISI-NEXT:    s_mov_b32 s9, s1
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index dbdea8e3c533d..71af21a11c2ce 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() {
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_add_u32 s7, s6, s6
-; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_addc_u32 s8, s6, 0
 ; GFX7-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX7-NEXT:    s_and_b64 s[4:5], s[4:5], exec
@@ -88,15 +86,13 @@ bb:
 define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
-; GFX7-NEXT:    s_load_dword s2, s[8:9], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[8:9], 0x0
 ; GFX7-NEXT:    s_add_i32 s12, s12, s17
 ; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_u32 s0, s2, s2
-; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT:    s_or_b32 s0, s0, s1
-; GFX7-NEXT:    s_addc_u32 s0, s2, 0
+; GFX7-NEXT:    s_add_u32 s1, s0, s0
+; GFX7-NEXT:    s_addc_u32 s0, s0, 0
 ; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 71f5a94a7f245..74a6d7fe39362 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s8, s1, 31
 ; GCN-NEXT:    s_add_u32 s0, s0, s8
@@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GCN-NEXT:    s_sub_u32 s12, 0, s10
-; GCN-NEXT:    s_subb_u32 s13, 0, s11
+; GCN-NEXT:    s_sub_u32 s0, 0, s10
+; GCN-NEXT:    s_subb_u32 s1, 0, s11
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s1, s12, s14
-; GCN-NEXT:    v_readfirstlane_b32 s17, v2
-; GCN-NEXT:    s_mul_i32 s15, s13, s0
-; GCN-NEXT:    s_mul_i32 s16, s12, s0
-; GCN-NEXT:    s_add_i32 s1, s17, s1
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s16
-; GCN-NEXT:    s_add_i32 s1, s1, s15
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s16
-; GCN-NEXT:    v_readfirstlane_b32 s15, v3
-; GCN-NEXT:    s_mul_i32 s17, s0, s1
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT:    s_add_u32 s15, s15, s17
-; GCN-NEXT:    v_readfirstlane_b32 s17, v0
-; GCN-NEXT:    s_addc_u32 s17, 0, s17
-; GCN-NEXT:    s_mul_i32 s16, s14, s16
-; GCN-NEXT:    v_readfirstlane_b32 s18, v4
-; GCN-NEXT:    s_add_u32 s15, s15, s16
-; GCN-NEXT:    s_addc_u32 s15, s17, s18
-; GCN-NEXT:    v_readfirstlane_b32 s16, v1
-; GCN-NEXT:    s_addc_u32 s16, s16, 0
-; GCN-NEXT:    s_mul_i32 s1, s14, s1
-; GCN-NEXT:    s_add_u32 s1, s15, s1
-; GCN-NEXT:    s_addc_u32 s15, 0, s16
-; GCN-NEXT:    s_add_u32 s16, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NEXT:    v_mul_hi_u32 v0, s12, v0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s14, s14, s15
-; GCN-NEXT:    s_mul_i32 s0, s12, s14
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s0, s1, s0
-; GCN-NEXT:    s_mul_i32 s13, s13, s16
-; GCN-NEXT:    s_mul_i32 s1, s12, s16
-; GCN-NEXT:    s_add_i32 s0, s0, s13
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mul_hi_u32 v3, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s16, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s14, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s16, v0
-; GCN-NEXT:    s_mul_i32 s13, s16, s0
-; GCN-NEXT:    v_readfirstlane_b32 s17, v2
-; GCN-NEXT:    s_add_u32 s13, s17, s13
-; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s1, s14, s1
-; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s12, v3
-; GCN-NEXT:    s_add_u32 s1, s13, s1
-; GCN-NEXT:    s_addc_u32 s1, s15, s12
+; GCN-NEXT:    v_mul_hi_u32 v2, s0, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    s_addc_u32 s12, s12, 0
-; GCN-NEXT:    s_mul_i32 s0, s14, s0
-; GCN-NEXT:    s_add_u32 s0, s1, s0
-; GCN-NEXT:    s_addc_u32 s12, 0, s12
-; GCN-NEXT:    s_add_u32 s15, s16, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s14, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s13, s0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s16, v2
+; GCN-NEXT:    s_mul_i32 s14, s1, s2
+; GCN-NEXT:    s_mul_i32 s15, s0, s2
+; GCN-NEXT:    s_add_i32 s13, s16, s13
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s15
+; GCN-NEXT:    s_add_i32 s13, s13, s14
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s13
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s15
+; GCN-NEXT:    v_readfirstlane_b32 s14, v3
+; GCN-NEXT:    s_mul_i32 s16, s2, s13
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s13
+; GCN-NEXT:    s_add_u32 s14, s14, s16
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    s_mul_i32 s15, s12, s15
+; GCN-NEXT:    s_addc_u32 s16, 0, s16
+; GCN-NEXT:    v_readfirstlane_b32 s17, v4
+; GCN-NEXT:    s_add_u32 s14, s14, s15
+; GCN-NEXT:    s_addc_u32 s14, s16, s17
+; GCN-NEXT:    v_readfirstlane_b32 s15, v1
+; GCN-NEXT:    s_addc_u32 s15, s15, 0
+; GCN-NEXT:    s_mul_i32 s13, s12, s13
+; GCN-NEXT:    s_add_u32 s13, s14, s13
+; GCN-NEXT:    s_addc_u32 s14, 0, s15
+; GCN-NEXT:    s_add_u32 s13, s2, s13
+; GCN-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    s_addc_u32 s12, s12, s14
+; GCN-NEXT:    s_mul_i32 s14, s0, s12
+; GCN-NEXT:    s_mul_i32 s1, s1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s15, v0
+; GCN-NEXT:    s_add_i32 s14, s15, s14
+; GCN-NEXT:    s_mul_i32 s0, s0, s13
+; GCN-NEXT:    s_add_i32 s1, s14, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s13, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GCN-NEXT:    s_mul_i32 s15, s13, s1
+; GCN-NEXT:    v_readfirstlane_b32 s17, v2
+; GCN-NEXT:    s_add_u32 s15, s17, s15
+; GCN-NEXT:    v_readfirstlane_b32 s16, v0
+; GCN-NEXT:    s_mul_i32 s0, s12, s0
+; GCN-NEXT:    s_addc_u32 s16, 0, s16
+; GCN-NEXT:    v_readfirstlane_b32 s14, v3
+; GCN-NEXT:    s_add_u32 s0, s15, s0
+; GCN-NEXT:    s_addc_u32 s0, s16, s14
+; GCN-NEXT:    v_readfirstlane_b32 s14, v1
+; GCN-NEXT:    s_addc_u32 s14, s14, 0
+; GCN-NEXT:    s_mul_i32 s1, s12, s1
+; GCN-NEXT:    s_add_u32 s0, s0, s1
+; GCN-NEXT:    s_addc_u32 s1, 0, s14
+; GCN-NEXT:    s_add_u32 s14, s13, s0
+; GCN-NEXT:    s_addc_u32 s15, s12, s1
 ; GCN-NEXT:    s_ashr_i32 s12, s7, 31
 ; GCN-NEXT:    s_add_u32 s0, s6, s12
 ; GCN-NEXT:    s_mov_b32 s13, s12
 ; GCN-NEXT:    s_addc_u32 s1, s7, s12
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
-; GCN-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_mul_hi_u32 v3, s6, v2
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s7, v2
-; GCN-NEXT:    s_mul_i32 s1, s6, s14
+; GCN-NEXT:    s_mul_i32 s1, s6, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v3
 ; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GCN-NEXT:    s_add_u32 s1, s16, s1
 ; GCN-NEXT:    s_addc_u32 s4, 0, s4
-; GCN-NEXT:    s_mul_i32 s15, s7, s15
+; GCN-NEXT:    s_mul_i32 s14, s7, s14
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v1
-; GCN-NEXT:    s_add_u32 s1, s1, s15
+; GCN-NEXT:    s_add_u32 s1, s1, s14
 ; GCN-NEXT:    s_addc_u32 s1, s4, s16
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    s_addc_u32 s4, s4, 0
-; GCN-NEXT:    s_mul_i32 s14, s7, s14
-; GCN-NEXT:    s_add_u32 s16, s1, s14
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    s_mul_i32 s14, s7, s15
+; GCN-NEXT:    s_add_u32 s14, s1, s14
+; GCN-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_addc_u32 s17, 0, s4
+; GCN-NEXT:    s_addc_u32 s15, 0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_mul_i32 s4, s10, s17
+; GCN-NEXT:    s_mul_i32 s4, s10, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s5, v0
 ; GCN-NEXT:    s_add_i32 s4, s5, s4
-; GCN-NEXT:    s_mul_i32 s5, s11, s16
-; GCN-NEXT:    s_add_i32 s18, s4, s5
-; GCN-NEXT:    s_sub_i32 s14, s7, s18
-; GCN-NEXT:    s_mul_i32 s4, s10, s16
+; GCN-NEXT:    s_mul_i32 s5, s11, s14
+; GCN-NEXT:    s_add_i32 s16, s4, s5
+; GCN-NEXT:    s_sub_i32 s17, s7, s16
+; GCN-NEXT:    s_mul_i32 s4, s10, s14
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s15, s4, s5
-; GCN-NEXT:    s_subb_u32 s19, s14, s11
-; GCN-NEXT:    s_sub_u32 s20, s6, s10
-; GCN-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s14, s15
-; GCN-NEXT:    s_subb_u32 s14, s19, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s11
-; GCN-NEXT:    s_cselect_b32 s15, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s20, s10
+; GCN-NEXT:    s_subb_u32 s17, s17, s11
+; GCN-NEXT:    s_sub_u32 s18, s6, s10
+; GCN-NEXT:    s_subb_u32 s17, s17, 0
+; GCN-NEXT:    s_cmp_ge_u32 s17, s11
 ; GCN-NEXT:    s_cselect_b32 s19, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s14, s11
-; GCN-NEXT:    s_cselect_b32 s14, s19, s15
-; GCN-NEXT:    s_add_u32 s15, s16, 1
-; GCN-NEXT:    s_addc_u32 s19, s17, 0
-; GCN-NEXT:    s_add_u32 s20, s16, 2
-; GCN-NEXT:    s_addc_u32 s21, s17, 0
-; GCN-NEXT:    s_cmp_lg_u32 s14, 0
-; GCN-NEXT:    s_cselect_b32 s14, s20, s15
-; GCN-NEXT:    s_cselect_b32 s15, s21, s19
+; GCN-NEXT:    s_cmp_ge_u32 s18, s10
+; GCN-NEXT:    s_cselect_b32 s18, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s17, s11
+; GCN-NEXT:    s_cselect_b32 s17, s18, s19
+; GCN-NEXT:    s_add_u32 s18, s14, 1
+; GCN-NEXT:    s_addc_u32 s19, s15, 0
+; GCN-NEXT:    s_add_u32 s20, s14, 2
+; GCN-NEXT:    s_addc_u32 s21, s15, 0
+; GCN-NEXT:    s_cmp_lg_u32 s17, 0
+; GCN-NEXT:    s_cselect_b32 s17, s20, s18
+; GCN-NEXT:    s_cselect_b32 s18, s21, s19
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_subb_u32 s4, s7, s18
+; GCN-NEXT:    s_subb_u32 s4, s7, s16
 ; GCN-NEXT:    s_cmp_ge_u32 s4, s11
 ; GCN-NEXT:    s_cselect_b32 s5, -1, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s6, s10
@@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_cmp_eq_u32 s4, s11
 ; GCN-NEXT:    s_cselect_b32 s4, s6, s5
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s5, s15, s17
-; GCN-NEXT:    s_cselect_b32 s4, s14, s16
+; GCN-NEXT:    s_cselect_b32 s5, s18, s15
+; GCN-NEXT:    s_cselect_b32 s4, s17, s14
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[12:13], s[8:9]
 ; GCN-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-NEXT:    s_sub_u32 s4, s4, s6
 ; GCN-NEXT:    s_subb_u32 s5, s5, s7
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s18, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s10, s10, s11
 ; GCN-IR-NEXT:    s_addc_u32 s10, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s16, 63, s16
@@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_sub_u32 s16, s16, s20
 ; GCN-IR-NEXT:    s_subb_u32 s17, s17, s21
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s20, s20, s21
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[8:9]
@@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GCN-NEXT:    s_sub_u32 s2, 0, s6
-; GCN-NEXT:    s_subb_u32 s10, 0, s7
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_subb_u32 s8, 0, s7
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s11, v1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    s_mul_i32 s9, s2, s11
-; GCN-NEXT:    v_readfirstlane_b32 s14, v2
-; GCN-NEXT:    s_mul_i32 s12, s10, s8
-; GCN-NEXT:    s_mul_i32 s13, s2, s8
-; GCN-NEXT:    s_add_i32 s9, s14, s9
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
-; GCN-NEXT:    s_add_i32 s9, s9, s12
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v3
-; GCN-NEXT:    s_mul_i32 s15, s8, s9
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s9
-; GCN-NEXT:    s_add_u32 s12, s12, s15
-; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s13, s11, s13
-; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s14, v4
-; GCN-NEXT:    s_add_u32 s12, s12, s13
-; GCN-NEXT:    s_addc_u32 s12, s15, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v1
-; GCN-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NEXT:    s_mul_i32 s9, s11, s9
-; GCN-NEXT:    s_add_u32 s9, s12, s9
-; GCN-NEXT:    s_addc_u32 s12, 0, s13
-; GCN-NEXT:    s_add_u32 s13, s8, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    v_readfirstlane_b32 s3, v0
+; GCN-NEXT:    s_mul_i32 s10, s2, s9
+; GCN-NEXT:    v_readfirstlane_b32 s13, v2
+; GCN-NEXT:    s_mul_i32 s11, s8, s3
+; GCN-NEXT:    s_mul_i32 s12, s2, s3
+; GCN-NEXT:    s_add_i32 s10, s13, s10
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s12
+; GCN-NEXT:    s_add_i32 s10, s10, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s10
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v3
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s10
+; GCN-NEXT:    s_mul_i32 s14, s3, s10
+; GCN-NEXT:    s_add_u32 s11, s11, s14
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s12, s9, s12
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s13, v4
+; GCN-NEXT:    s_add_u32 s11, s11, s12
+; GCN-NEXT:    v_readfirstlane_b32 s15, v1
+; GCN-NEXT:    s_addc_u32 s11, s14, s13
+; GCN-NEXT:    s_addc_u32 s12, s15, 0
+; GCN-NEXT:    s_mul_i32 s10, s9, s10
+; GCN-NEXT:    s_add_u32 s10, s11, s10
+; GCN-NEXT:    s_addc_u32 s11, 0, s12
+; GCN-NEXT:    s_add_u32 s10, s3, s10
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s11, s11, s12
-; GCN-NEXT:    s_mul_i32 s8, s2, s11
-; GCN-NEXT:    v_readfirstlane_b32 s9, v0
-; GCN-NEXT:    s_add_i32 s8, s9, s8
-; GCN-NEXT:    s_mul_i32 s10, s10, s13
-; GCN-NEXT:    s_mul_i32 s2, s2, s13
-; GCN-NEXT:    s_add_i32 s8, s8, s10
+; GCN-NEXT:    s_addc_u32 s9, s9, s11
+; GCN-NEXT:    s_mul_i32 s11, s2, s9
+; GCN-NEXT:    s_mul_i32 s8, s8, s10
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_add_i32 s11, s12, s11
+; GCN-NEXT:    s_mul_i32 s2, s2, s10
+; GCN-NEXT:    s_add_i32 s8, s11, s8
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mul_hi_u32 v3, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s13, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GCN-NEXT:    s_mul_i32 s10, s13, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT:    s_mul_i32 s12, s10, s8
 ; GCN-NEXT:    v_readfirstlane_b32 s14, v2
-; GCN-NEXT:    s_add_u32 s10, s14, s10
-; GCN-NEXT:    v_readfirstlane_b32 s12, v0
-; GCN-NEXT:    s_mul_i32 s2, s11, s2
-; GCN-NEXT:    s_addc_u32 s12, 0, s12
-; GCN-NEXT:    v_readfirstlane_b32 s9, v3
-; GCN-NEXT:    s_add_u32 s2, s10, s2
-; GCN-NEXT:    s_addc_u32 s2, s12, s9
-; GCN-NEXT:    v_readfirstlane_b32 s9, v1
-; GCN-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-NEXT:    s_mul_i32 s8, s11, s8
+; GCN-NEXT:    s_add_u32 s12, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s13, v0
+; GCN-NEXT:    s_mul_i32 s2, s9, s2
+; GCN-NEXT:    s_addc_u32 s13, 0, s13
+; GCN-NEXT:    v_readfirstlane_b32 s11, v3
+; GCN-NEXT:    s_add_u32 s2, s12, s2
+; GCN-NEXT:    s_addc_u32 s2, s13, s11
+; GCN-NEXT:    v_readfirstlane_b32 s11, v1
+; GCN-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-NEXT:    s_mul_i32 s8, s9, s8
 ; GCN-NEXT:    s_add_u32 s2, s2, s8
-; GCN-NEXT:    s_addc_u32 s10, 0, s9
-; GCN-NEXT:    s_add_u32 s2, s13, s2
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s8, s11, s10
+; GCN-NEXT:    s_addc_u32 s8, 0, s11
+; GCN-NEXT:    s_add_u32 s2, s10, s2
+; GCN-NEXT:    s_addc_u32 s8, s9, s8
 ; GCN-NEXT:    v_mul_hi_u32 v1, s2, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s8, 24
 ; GCN-NEXT:    s_mul_i32 s8, s8, 24
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_readfirstlane_b32 s10, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
 ; GCN-NEXT:    s_add_u32 s8, s10, s8
-; GCN-NEXT:    s_addc_u32 s12, 0, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    s_addc_u32 s10, 0, s9
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    s_mul_i32 s8, s7, s12
+; GCN-NEXT:    s_mul_i32 s8, s7, s10
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v0
-; GCN-NEXT:    s_add_i32 s13, s9, s8
-; GCN-NEXT:    s_sub_i32 s10, 0, s13
-; GCN-NEXT:    s_mul_i32 s8, s6, s12
-; GCN-NEXT:    s_sub_u32 s14, 24, s8
+; GCN-NEXT:    s_add_i32 s11, s9, s8
+; GCN-NEXT:    s_sub_i32 s12, 0, s11
+; GCN-NEXT:    s_mul_i32 s8, s6, s10
+; GCN-NEXT:    s_sub_u32 s13, 24, s8
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s8, s9
-; GCN-NEXT:    s_subb_u32 s15, s10, s7
-; GCN-NEXT:    s_sub_u32 s16, s14, s6
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s10, s15, 0
-; GCN-NEXT:    s_cmp_ge_u32 s10, s7
-; GCN-NEXT:    s_cselect_b32 s11, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s16, s6
+; GCN-NEXT:    s_subb_u32 s12, s12, s7
+; GCN-NEXT:    s_sub_u32 s14, s13, s6
+; GCN-NEXT:    s_subb_u32 s12, s12, 0
+; GCN-NEXT:    s_cmp_ge_u32 s12, s7
 ; GCN-NEXT:    s_cselect_b32 s15, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s10, s7
-; GCN-NEXT:    s_cselect_b32 s10, s15, s11
-; GCN-NEXT:    s_add_u32 s11, s12, 1
+; GCN-NEXT:    s_cmp_ge_u32 s14, s6
+; GCN-NEXT:    s_cselect_b32 s14, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s12, s7
+; GCN-NEXT:    s_cselect_b32 s12, s14, s15
+; GCN-NEXT:    s_add_u32 s14, s10, 1
 ; GCN-NEXT:    s_addc_u32 s15, 0, 0
-; GCN-NEXT:    s_add_u32 s16, s12, 2
+; GCN-NEXT:    s_add_u32 s16, s10, 2
 ; GCN-NEXT:    s_addc_u32 s17, 0, 0
-; GCN-NEXT:    s_cmp_lg_u32 s10, 0
-; GCN-NEXT:    s_cselect_b32 s10, s16, s11
-; GCN-NEXT:    s_cselect_b32 s11, s17, s15
+; GCN-NEXT:    s_cmp_lg_u32 s12, 0
+; GCN-NEXT:    s_cselect_b32 s12, s16, s14
+; GCN-NEXT:    s_cselect_b32 s14, s17, s15
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, 0, s13
+; GCN-NEXT:    s_subb_u32 s8, 0, s11
 ; GCN-NEXT:    s_cmp_ge_u32 s8, s7
 ; GCN-NEXT:    s_cselect_b32 s9, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s6
+; GCN-NEXT:    s_cmp_ge_u32 s13, s6
 ; GCN-NEXT:    s_cselect_b32 s6, -1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s8, s7
 ; GCN-NEXT:    s_cselect_b32 s6, s6, s9
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s7, s11, 0
-; GCN-NEXT:    s_cselect_b32 s6, s10, s12
+; GCN-NEXT:    s_cselect_b32 s7, s14, 0
+; GCN-NEXT:    s_cselect_b32 s6, s12, s10
 ; GCN-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_subb_u32 s7, s7, s4
@@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
@@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s18
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s19
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ea9bb0417dfa4..862e2dd2de051 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT:    s_sub_u32 s10, 0, s8
-; GCN-NEXT:    s_subb_u32 s11, 0, s9
+; GCN-NEXT:    s_sub_u32 s0, 0, s8
+; GCN-NEXT:    s_subb_u32 s1, 0, s9
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s1, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_mul_i32 s13, s11, s0
-; GCN-NEXT:    s_mul_i32 s14, s10, s0
-; GCN-NEXT:    s_add_i32 s1, s15, s1
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT:    s_add_i32 s1, s1, s13
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v3
-; GCN-NEXT:    s_mul_i32 s15, s0, s1
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT:    s_add_u32 s13, s13, s15
+; GCN-NEXT:    v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s11, s0, s10
+; GCN-NEXT:    v_readfirstlane_b32 s14, v2
+; GCN-NEXT:    s_mul_i32 s12, s1, s2
+; GCN-NEXT:    s_mul_i32 s13, s0, s2
+; GCN-NEXT:    s_add_i32 s11, s14, s11
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT:    s_add_i32 s11, s11, s12
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_mul_i32 s15, s2, s11
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT:    s_add_u32 s12, s12, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s14, s12, s14
+; GCN-NEXT:    s_mul_i32 s13, s10, s13
 ; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s16, v4
-; GCN-NEXT:    s_add_u32 s13, s13, s14
-; GCN-NEXT:    s_addc_u32 s13, s15, s16
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s14, s14, 0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_add_u32 s1, s13, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s14
-; GCN-NEXT:    s_add_u32 s14, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s12, s12, s13
-; GCN-NEXT:    s_mul_i32 s0, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s0, s1, s0
-; GCN-NEXT:    s_mul_i32 s11, s11, s14
-; GCN-NEXT:    s_mul_i32 s1, s10, s14
-; GCN-NEXT:    s_add_i32 s0, s0, s11
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT:    s_mul_i32 s11, s14, s0
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_add_u32 s11, s15, s11
+; GCN-NEXT:    v_readfirstlane_b32 s14, v4
+; GCN-NEXT:    s_add_u32 s12, s12, s13
+; GCN-NEXT:    s_addc_u32 s12, s15, s14
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NEXT:    s_mul_i32 s11, s10, s11
+; GCN-NEXT:    s_add_u32 s11, s12, s11
+; GCN-NEXT:    s_addc_u32 s12, 0, s13
+; GCN-NEXT:    s_add_u32 s11, s2, s11
+; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    s_addc_u32 s10, s10, s12
+; GCN-NEXT:    s_mul_i32 s12, s0, s10
+; GCN-NEXT:    s_mul_i32 s1, s1, s11
 ; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_add_u32 s1, s11, s1
-; GCN-NEXT:    s_addc_u32 s1, s13, s10
-; GCN-NEXT:    v_readfirstlane_b32 s10, v1
-; GCN-NEXT:    s_addc_u32 s10, s10, 0
-; GCN-NEXT:    s_mul_i32 s0, s12, s0
-; GCN-NEXT:    s_add_u32 s0, s1, s0
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    s_add_u32 s11, s14, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s1, s12, s10
+; GCN-NEXT:    s_add_i32 s12, s13, s12
+; GCN-NEXT:    s_mul_i32 s0, s0, s11
+; GCN-NEXT:    s_add_i32 s1, s12, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    s_mul_i32 s13, s11, s1
+; GCN-NEXT:    v_readfirstlane_b32 s15, v2
+; GCN-NEXT:    s_add_u32 s13, s15, s13
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s0, s10, s0
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_add_u32 s0, s13, s0
+; GCN-NEXT:    s_addc_u32 s0, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v1
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s1, s10, s1
+; GCN-NEXT:    s_add_u32 s0, s0, s1
+; GCN-NEXT:    s_addc_u32 s1, 0, s12
+; GCN-NEXT:    s_add_u32 s11, s11, s0
+; GCN-NEXT:    s_addc_u32 s1, s10, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_mul_i32 s4, s8, s4
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s4, s5
 ; GCN-NEXT:    s_subb_u32 s13, s10, s9
 ; GCN-NEXT:    s_sub_u32 s14, s6, s8
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s15, s10, s11
 ; GCN-NEXT:    s_subb_u32 s15, s13, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_cmp_eq_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, s17, s16
 ; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s13, s13, s9
-; GCN-NEXT:    s_sub_u32 s17, s14, s8
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s10, s13, 0
+; GCN-NEXT:    s_subb_u32 s10, s13, s9
+; GCN-NEXT:    s_sub_u32 s11, s14, s8
+; GCN-NEXT:    s_subb_u32 s10, s10, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s16, 0
-; GCN-NEXT:    s_cselect_b32 s11, s17, s14
+; GCN-NEXT:    s_cselect_b32 s11, s11, s14
 ; GCN-NEXT:    s_cselect_b32 s10, s10, s15
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_cmp_lg_u32 s5, 0
 ; GCN-NEXT:    s_cselect_b32 s4, s10, s4
 ; GCN-NEXT:    s_cselect_b32 s5, s11, s6
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s18
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s19
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
@@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GCN-NEXT:    s_sub_u32 s10, 0, s4
-; GCN-NEXT:    s_subb_u32 s11, 0, s5
+; GCN-NEXT:    s_sub_u32 s8, 0, s4
+; GCN-NEXT:    s_subb_u32 s9, 0, s5
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    s_mul_i32 s9, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_mul_i32 s13, s11, s8
-; GCN-NEXT:    s_mul_i32 s14, s10, s8
-; GCN-NEXT:    s_add_i32 s9, s15, s9
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT:    s_add_i32 s9, s9, s13
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v3
-; GCN-NEXT:    s_mul_i32 s15, s8, s9
-; GCN-NEXT:    s_add_u32 s13, s13, s15
-; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, v1, s9
-; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    s_mul_i32 s14, s12, s14
-; GCN-NEXT:    v_readfirstlane_b32 s16, v4
-; GCN-NEXT:    s_add_u32 s13, s13, s14
-; GCN-NEXT:    s_addc_u32 s13, s15, s16
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v0
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s11, s8, s10
+; GCN-NEXT:    v_readfirstlane_b32 s14, v2
+; GCN-NEXT:    s_mul_i32 s12, s9, s2
+; GCN-NEXT:    s_mul_i32 s13, s8, s2
+; GCN-NEXT:    s_add_i32 s11, s14, s11
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT:    s_add_i32 s11, s11, s12
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_mul_i32 s14, s2, s11
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT:    s_add_u32 s12, s12, s14
 ; GCN-NEXT:    v_readfirstlane_b32 s14, v0
-; GCN-NEXT:    s_addc_u32 s14, s14, 0
-; GCN-NEXT:    s_mul_i32 s9, s12, s9
-; GCN-NEXT:    s_add_u32 s9, s13, s9
-; GCN-NEXT:    s_addc_u32 s13, 0, s14
-; GCN-NEXT:    s_add_u32 s14, s8, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s12, s12, s13
-; GCN-NEXT:    s_mul_i32 s8, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s9, v0
-; GCN-NEXT:    s_add_i32 s8, s9, s8
-; GCN-NEXT:    s_mul_i32 s11, s11, s14
-; GCN-NEXT:    s_mul_i32 s9, s10, s14
-; GCN-NEXT:    s_add_i32 s8, s8, s11
-; GCN-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT:    s_mul_i32 s11, s14, s8
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_add_u32 s11, s15, s11
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    s_mul_i32 s13, s10, s13
+; GCN-NEXT:    v_readfirstlane_b32 s15, v4
+; GCN-NEXT:    s_add_u32 s12, s12, s13
+; GCN-NEXT:    s_addc_u32 s12, s14, s15
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NEXT:    s_mul_i32 s11, s10, s11
+; GCN-NEXT:    s_add_u32 s11, s12, s11
+; GCN-NEXT:    s_addc_u32 s12, 0, s13
+; GCN-NEXT:    s_add_u32 s11, s2, s11
+; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_addc_u32 s10, s10, s12
+; GCN-NEXT:    s_mul_i32 s12, s8, s10
+; GCN-NEXT:    s_mul_i32 s9, s9, s11
 ; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s9, s12, s9
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_add_u32 s9, s11, s9
-; GCN-NEXT:    s_addc_u32 s9, s13, s10
-; GCN-NEXT:    v_readfirstlane_b32 s10, v1
-; GCN-NEXT:    s_addc_u32 s10, s10, 0
-; GCN-NEXT:    s_mul_i32 s8, s12, s8
-; GCN-NEXT:    s_add_u32 s8, s9, s8
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    s_add_u32 s11, s14, s8
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_addc_u32 s10, s12, s10
+; GCN-NEXT:    s_add_i32 s12, s13, s12
+; GCN-NEXT:    s_mul_i32 s8, s8, s11
+; GCN-NEXT:    s_add_i32 s9, s12, s9
+; GCN-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    s_mul_i32 s13, s11, s9
+; GCN-NEXT:    v_readfirstlane_b32 s15, v2
+; GCN-NEXT:    s_add_u32 s13, s15, s13
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s8, s10, s8
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_add_u32 s8, s13, s8
+; GCN-NEXT:    s_addc_u32 s8, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v1
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s9, s10, s9
+; GCN-NEXT:    s_add_u32 s8, s8, s9
+; GCN-NEXT:    s_addc_u32 s9, 0, s12
+; GCN-NEXT:    s_add_u32 s11, s11, s8
+; GCN-NEXT:    s_addc_u32 s10, s10, s9
 ; GCN-NEXT:    s_ashr_i32 s8, s7, 31
 ; GCN-NEXT:    s_add_u32 s6, s6, s8
 ; GCN-NEXT:    s_mov_b32 s9, s8
@@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GCN-NEXT:    s_addc_u32 s11, 0, s12
 ; GCN-NEXT:    s_mul_i32 s11, s4, s11
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_readfirstlane_b32 s12, v0
 ; GCN-NEXT:    s_add_i32 s11, s12, s11
 ; GCN-NEXT:    s_mul_i32 s12, s5, s10
@@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mul_i32 s10, s4, s10
 ; GCN-NEXT:    s_sub_u32 s6, s6, s10
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s13, s10, s11
 ; GCN-NEXT:    s_subb_u32 s15, s12, s5
 ; GCN-NEXT:    s_sub_u32 s16, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_or_b32 s17, s12, s13
 ; GCN-NEXT:    s_subb_u32 s17, s15, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s17, s5
 ; GCN-NEXT:    s_cselect_b32 s18, -1, 0
@@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-NEXT:    s_cmp_eq_u32 s17, s5
 ; GCN-NEXT:    s_cselect_b32 s18, s19, s18
 ; GCN-NEXT:    s_or_b32 s12, s12, s13
-; GCN-NEXT:    s_subb_u32 s15, s15, s5
-; GCN-NEXT:    s_sub_u32 s19, s16, s4
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_or_b32 s12, s12, s13
-; GCN-NEXT:    s_subb_u32 s12, s15, 0
+; GCN-NEXT:    s_subb_u32 s12, s15, s5
+; GCN-NEXT:    s_sub_u32 s13, s16, s4
+; GCN-NEXT:    s_subb_u32 s12, s12, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s18, 0
-; GCN-NEXT:    s_cselect_b32 s13, s19, s16
+; GCN-NEXT:    s_cselect_b32 s13, s13, s16
 ; GCN-NEXT:    s_cselect_b32 s12, s12, s17
 ; GCN-NEXT:    s_or_b32 s10, s10, s11
 ; GCN-NEXT:    s_subb_u32 s7, s7, s14
@@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s16, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s10, s10, s11
 ; GCN-IR-NEXT:    s_addc_u32 s10, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
@@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    s_sub_u32 s14, s14, s20
 ; GCN-IR-NEXT:    s_subb_u32 s15, s15, s21
 ; GCN-IR-NEXT:    s_add_u32 s18, s18, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s20, s20, s21
 ; GCN-IR-NEXT:    s_addc_u32 s19, s19, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[20:21], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
@@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s5
 ; GCN-NEXT:    s_sub_u32 s2, 0, s4
-; GCN-NEXT:    s_subb_u32 s8, 0, s5
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_subb_u32 s6, 0, s5
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_readfirstlane_b32 s9, v1
-; GCN-NEXT:    v_readfirstlane_b32 s6, v0
-; GCN-NEXT:    s_mul_i32 s7, s2, s9
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_mul_i32 s10, s8, s6
-; GCN-NEXT:    s_mul_i32 s11, s2, s6
-; GCN-NEXT:    s_add_i32 s7, s12, s7
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT:    s_add_i32 s7, s7, s10
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s7
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_mul_i32 s13, s6, s7
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s7
-; GCN-NEXT:    s_add_u32 s10, s10, s13
-; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s11, s9, s11
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v4
-; GCN-NEXT:    s_add_u32 s10, s10, s11
-; GCN-NEXT:    s_addc_u32 s10, s13, s12
-; GCN-NEXT:    v_readfirstlane_b32 s11, v1
-; GCN-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-NEXT:    s_mul_i32 s7, s9, s7
-; GCN-NEXT:    s_add_u32 s7, s10, s7
-; GCN-NEXT:    s_addc_u32 s10, 0, s11
-; GCN-NEXT:    s_add_u32 s11, s6, s7
-; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s7, v1
+; GCN-NEXT:    v_readfirstlane_b32 s3, v0
+; GCN-NEXT:    s_mul_i32 s8, s2, s7
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    s_mul_i32 s9, s6, s3
+; GCN-NEXT:    s_mul_i32 s10, s2, s3
+; GCN-NEXT:    s_add_i32 s8, s11, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT:    s_add_i32 s8, s8, s9
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT:    s_mul_i32 s12, s3, s8
+; GCN-NEXT:    s_add_u32 s9, s9, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_mul_i32 s10, s7, s10
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v4
+; GCN-NEXT:    s_add_u32 s9, s9, s10
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s9, s12, s11
+; GCN-NEXT:    s_addc_u32 s10, s13, 0
+; GCN-NEXT:    s_mul_i32 s8, s7, s8
+; GCN-NEXT:    s_add_u32 s8, s9, s8
+; GCN-NEXT:    s_addc_u32 s9, 0, s10
+; GCN-NEXT:    s_add_u32 s8, s3, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_addc_u32 s9, s9, s10
-; GCN-NEXT:    s_mul_i32 s6, s2, s9
-; GCN-NEXT:    v_readfirstlane_b32 s7, v0
-; GCN-NEXT:    s_add_i32 s6, s7, s6
-; GCN-NEXT:    s_mul_i32 s8, s8, s11
-; GCN-NEXT:    s_mul_i32 s2, s2, s11
-; GCN-NEXT:    s_add_i32 s6, s6, s8
+; GCN-NEXT:    s_addc_u32 s7, s7, s9
+; GCN-NEXT:    s_mul_i32 s9, s2, s7
+; GCN-NEXT:    s_mul_i32 s6, s6, s8
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    s_add_i32 s9, s10, s9
+; GCN-NEXT:    s_mul_i32 s2, s2, s8
+; GCN-NEXT:    s_add_i32 s6, s9, s6
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mul_i32 s8, s11, s6
+; GCN-NEXT:    v_mul_hi_u32 v3, s7, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_mul_i32 s10, s8, s6
 ; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_add_u32 s8, s12, s8
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mul_i32 s2, s9, s2
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    v_readfirstlane_b32 s7, v3
-; GCN-NEXT:    s_add_u32 s2, s8, s2
-; GCN-NEXT:    s_addc_u32 s2, s10, s7
-; GCN-NEXT:    v_readfirstlane_b32 s7, v1
-; GCN-NEXT:    s_addc_u32 s7, s7, 0
-; GCN-NEXT:    s_mul_i32 s6, s9, s6
+; GCN-NEXT:    s_add_u32 s10, s12, s10
+; GCN-NEXT:    v_readfirstlane_b32 s11, v0
+; GCN-NEXT:    s_mul_i32 s2, s7, s2
+; GCN-NEXT:    s_addc_u32 s11, 0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_add_u32 s2, s10, s2
+; GCN-NEXT:    s_addc_u32 s2, s11, s9
+; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mul_i32 s6, s7, s6
 ; GCN-NEXT:    s_add_u32 s2, s2, s6
-; GCN-NEXT:    s_addc_u32 s8, 0, s7
-; GCN-NEXT:    s_add_u32 s2, s11, s2
-; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s6, s6, s7
-; GCN-NEXT:    s_addc_u32 s6, s9, s8
+; GCN-NEXT:    s_addc_u32 s6, 0, s9
+; GCN-NEXT:    s_add_u32 s2, s8, s2
+; GCN-NEXT:    s_addc_u32 s6, s7, s6
 ; GCN-NEXT:    v_mul_hi_u32 v1, s2, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, s6, 24
 ; GCN-NEXT:    s_mul_i32 s6, s6, 24
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s7, v0
 ; GCN-NEXT:    s_add_u32 s6, s8, s6
@@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GCN-NEXT:    s_mul_i32 s7, s5, s6
 ; GCN-NEXT:    s_mul_i32 s6, s4, s6
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    s_add_i32 s10, s8, s7
 ; GCN-NEXT:    s_sub_i32 s8, 0, s10
 ; GCN-NEXT:    s_sub_u32 s11, 24, s6
 ; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT:    s_or_b32 s9, s6, s7
 ; GCN-NEXT:    s_subb_u32 s12, s8, s5
 ; GCN-NEXT:    s_sub_u32 s13, s11, s4
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s8, s9
 ; GCN-NEXT:    s_subb_u32 s14, s12, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s14, s5
 ; GCN-NEXT:    s_cselect_b32 s15, -1, 0
@@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_cmp_eq_u32 s14, s5
 ; GCN-NEXT:    s_cselect_b32 s15, s16, s15
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s12, s12, s5
-; GCN-NEXT:    s_sub_u32 s16, s13, s4
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, s12, 0
+; GCN-NEXT:    s_subb_u32 s8, s12, s5
+; GCN-NEXT:    s_sub_u32 s9, s13, s4
+; GCN-NEXT:    s_subb_u32 s8, s8, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s15, 0
-; GCN-NEXT:    s_cselect_b32 s9, s16, s13
+; GCN-NEXT:    s_cselect_b32 s9, s9, s13
 ; GCN-NEXT:    s_cselect_b32 s8, s8, s14
 ; GCN-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-NEXT:    s_subb_u32 s6, 0, s10
@@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s8, s2, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s9, s10, s11
 ; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s2, 63, s2
@@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s16
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s17
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bdd22f25e91c8..b000fae124ede 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_add_u32 s2, s2, s8
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    s_addc_u32 s3, s3, s9
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_add_u32 s4, s4, s6
-; SI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT:    s_or_b32 s6, s12, s13
 ; SI-NEXT:    s_addc_u32 s5, s5, s7
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fd461ac80ea55..775483c040b7f 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s16
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s17
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[2:3], s[4:5]
@@ -786,12 +782,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-LABEL: s_test_udiv_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_sub_u32 s6, 0, s2
-; GCN-NEXT:    s_subb_u32 s8, 0, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -800,118 +795,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v1
+; GCN-NEXT:    v_readfirstlane_b32 s7, v0
+; GCN-NEXT:    s_mul_i32 s8, s4, s6
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    s_mul_i32 s9, s5, s7
+; GCN-NEXT:    s_mul_i32 s10, s4, s7
+; GCN-NEXT:    s_add_i32 s8, s11, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT:    s_add_i32 s8, s8, s9
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_mul_i32 s12, s7, s8
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT:    s_add_u32 s9, s9, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_mul_i32 s10, s6, s10
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v4
+; GCN-NEXT:    s_add_u32 s9, s9, s10
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s9, s12, s11
+; GCN-NEXT:    s_mul_i32 s8, s6, s8
+; GCN-NEXT:    s_addc_u32 s10, s13, 0
+; GCN-NEXT:    s_add_u32 s8, s9, s8
+; GCN-NEXT:    s_addc_u32 s9, 0, s10
+; GCN-NEXT:    s_add_u32 s8, s7, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT:    s_addc_u32 s6, s6, s9
+; GCN-NEXT:    s_mul_i32 s9, s4, s6
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    s_add_i32 s9, s10, s9
+; GCN-NEXT:    s_mul_i32 s4, s4, s8
+; GCN-NEXT:    s_add_i32 s5, s9, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_mul_i32 s10, s8, s5
+; GCN-NEXT:    v_readfirstlane_b32 s12, v2
+; GCN-NEXT:    s_add_u32 s10, s12, s10
+; GCN-NEXT:    v_readfirstlane_b32 s11, v0
+; GCN-NEXT:    s_mul_i32 s4, s6, s4
+; GCN-NEXT:    s_addc_u32 s11, 0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_add_u32 s4, s10, s4
+; GCN-NEXT:    s_addc_u32 s4, s11, s9
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mul_i32 s5, s6, s5
+; GCN-NEXT:    s_add_u32 s4, s4, s5
+; GCN-NEXT:    s_addc_u32 s5, 0, s9
+; GCN-NEXT:    s_add_u32 s4, s8, s4
+; GCN-NEXT:    s_addc_u32 s5, s6, s5
+; GCN-NEXT:    v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT:    v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT:    s_mul_i32 s5, s5, 24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    s_mul_i32 s5, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_mul_i32 s10, s8, s4
-; GCN-NEXT:    s_mul_i32 s11, s6, s4
-; GCN-NEXT:    s_add_i32 s5, s12, s5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT:    s_add_i32 s5, s5, s10
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT:    s_mul_i32 s13, s4, s5
-; GCN-NEXT:    s_add_u32 s10, s10, s13
-; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s11, s9, s11
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v4
-; GCN-NEXT:    s_add_u32 s10, s10, s11
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s10, s13, s12
-; GCN-NEXT:    s_addc_u32 s11, s14, 0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_add_u32 s5, s10, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s11
-; GCN-NEXT:    s_add_u32 s11, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s9, s9, s10
-; GCN-NEXT:    s_mul_i32 s4, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_i32 s4, s5, s4
-; GCN-NEXT:    s_mul_i32 s8, s8, s11
-; GCN-NEXT:    s_mul_i32 s5, s6, s11
-; GCN-NEXT:    s_add_i32 s4, s4, s8
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mul_i32 s8, s11, s4
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_add_u32 s8, s12, s8
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    v_readfirstlane_b32 s6, v3
 ; GCN-NEXT:    s_add_u32 s5, s8, s5
-; GCN-NEXT:    s_addc_u32 s5, s10, s6
-; GCN-NEXT:    v_readfirstlane_b32 s6, v1
-; GCN-NEXT:    s_addc_u32 s6, s6, 0
-; GCN-NEXT:    s_mul_i32 s4, s9, s4
-; GCN-NEXT:    s_add_u32 s4, s5, s4
-; GCN-NEXT:    s_addc_u32 s6, 0, s6
-; GCN-NEXT:    s_add_u32 s8, s11, s4
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s4, s9, s6
-; GCN-NEXT:    v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT:    v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT:    s_mul_i32 s4, s4, 24
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v1
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_u32 s4, s8, s4
-; GCN-NEXT:    s_addc_u32 s10, 0, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NEXT:    s_addc_u32 s8, 0, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_mul_i32 s0, s3, s10
+; GCN-NEXT:    s_mul_i32 s0, s3, s8
 ; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s11, s1, s0
-; GCN-NEXT:    s_sub_i32 s8, 0, s11
-; GCN-NEXT:    s_mul_i32 s0, s2, s10
-; GCN-NEXT:    s_sub_u32 s12, 24, s0
+; GCN-NEXT:    s_add_i32 s9, s1, s0
+; GCN-NEXT:    s_sub_i32 s10, 0, s9
+; GCN-NEXT:    s_mul_i32 s0, s2, s8
+; GCN-NEXT:    s_sub_u32 s11, 24, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s9, s0, s1
-; GCN-NEXT:    s_subb_u32 s13, s8, s3
-; GCN-NEXT:    s_sub_u32 s14, s12, s2
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, s13, 0
-; GCN-NEXT:    s_cmp_ge_u32 s8, s3
-; GCN-NEXT:    s_cselect_b32 s9, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s14, s2
+; GCN-NEXT:    s_subb_u32 s10, s10, s3
+; GCN-NEXT:    s_sub_u32 s12, s11, s2
+; GCN-NEXT:    s_subb_u32 s10, s10, 0
+; GCN-NEXT:    s_cmp_ge_u32 s10, s3
 ; GCN-NEXT:    s_cselect_b32 s13, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s8, s3
-; GCN-NEXT:    s_cselect_b32 s8, s13, s9
-; GCN-NEXT:    s_add_u32 s9, s10, 1
+; GCN-NEXT:    s_cmp_ge_u32 s12, s2
+; GCN-NEXT:    s_cselect_b32 s12, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s10, s3
+; GCN-NEXT:    s_cselect_b32 s10, s12, s13
+; GCN-NEXT:    s_add_u32 s12, s8, 1
 ; GCN-NEXT:    s_addc_u32 s13, 0, 0
-; GCN-NEXT:    s_add_u32 s14, s10, 2
+; GCN-NEXT:    s_add_u32 s14, s8, 2
 ; GCN-NEXT:    s_addc_u32 s15, 0, 0
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s8, s14, s9
-; GCN-NEXT:    s_cselect_b32 s9, s15, s13
+; GCN-NEXT:    s_cmp_lg_u32 s10, 0
+; GCN-NEXT:    s_cselect_b32 s10, s14, s12
+; GCN-NEXT:    s_cselect_b32 s12, s15, s13
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_subb_u32 s0, 0, s11
+; GCN-NEXT:    s_subb_u32 s0, 0, s9
 ; GCN-NEXT:    s_cmp_ge_u32 s0, s3
 ; GCN-NEXT:    s_cselect_b32 s1, -1, 0
-; GCN-NEXT:    s_cmp_ge_u32 s12, s2
+; GCN-NEXT:    s_cmp_ge_u32 s11, s2
 ; GCN-NEXT:    s_cselect_b32 s2, -1, 0
 ; GCN-NEXT:    s_cmp_eq_u32 s0, s3
 ; GCN-NEXT:    s_cselect_b32 s0, s2, s1
 ; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s9, 0
-; GCN-NEXT:    s_cselect_b32 s1, s8, s10
+; GCN-NEXT:    s_cselect_b32 s0, s12, 0
+; GCN-NEXT:    s_cselect_b32 s1, s10, s8
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -937,8 +926,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -969,8 +956,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s16
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s17
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
@@ -1307,8 +1292,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB11_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s11, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -1336,8 +1319,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s2, s2, s8
 ; GCN-IR-NEXT:    s_subb_u32 s3, s3, 0
 ; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s12, s12, s13
 ; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 137dc1fe42294..28e6627b87413 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT:    s_sub_u32 s10, 0, s8
-; GCN-NEXT:    s_subb_u32 s11, 0, s9
+; GCN-NEXT:    s_sub_u32 s0, 0, s8
+; GCN-NEXT:    s_subb_u32 s1, 0, s9
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT:    v_readfirstlane_b32 s12, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v0
-; GCN-NEXT:    s_mul_i32 s1, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_mul_i32 s13, s11, s0
-; GCN-NEXT:    s_mul_i32 s14, s10, s0
-; GCN-NEXT:    s_add_i32 s1, s15, s1
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT:    s_add_i32 s1, s1, s13
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT:    v_readfirstlane_b32 s13, v3
-; GCN-NEXT:    s_mul_i32 s15, s0, s1
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT:    s_add_u32 s13, s13, s15
+; GCN-NEXT:    v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT:    v_readfirstlane_b32 s10, v1
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_mul_i32 s11, s0, s10
+; GCN-NEXT:    v_readfirstlane_b32 s14, v2
+; GCN-NEXT:    s_mul_i32 s12, s1, s2
+; GCN-NEXT:    s_mul_i32 s13, s0, s2
+; GCN-NEXT:    s_add_i32 s11, s14, s11
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT:    s_add_i32 s11, s11, s12
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_mul_i32 s15, s2, s11
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT:    s_add_u32 s12, s12, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s15, v0
-; GCN-NEXT:    s_mul_i32 s14, s12, s14
+; GCN-NEXT:    s_mul_i32 s13, s10, s13
 ; GCN-NEXT:    s_addc_u32 s15, 0, s15
-; GCN-NEXT:    v_readfirstlane_b32 s16, v4
-; GCN-NEXT:    s_add_u32 s13, s13, s14
-; GCN-NEXT:    s_addc_u32 s13, s15, s16
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s14, s14, 0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_add_u32 s1, s13, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s14
-; GCN-NEXT:    s_add_u32 s14, s0, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s12, s12, s13
-; GCN-NEXT:    s_mul_i32 s0, s10, s12
-; GCN-NEXT:    v_readfirstlane_b32 s1, v0
-; GCN-NEXT:    s_add_i32 s0, s1, s0
-; GCN-NEXT:    s_mul_i32 s11, s11, s14
-; GCN-NEXT:    s_mul_i32 s1, s10, s14
-; GCN-NEXT:    s_add_i32 s0, s0, s11
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT:    s_mul_i32 s11, s14, s0
-; GCN-NEXT:    v_readfirstlane_b32 s15, v2
-; GCN-NEXT:    s_add_u32 s11, s15, s11
+; GCN-NEXT:    v_readfirstlane_b32 s14, v4
+; GCN-NEXT:    s_add_u32 s12, s12, s13
+; GCN-NEXT:    s_addc_u32 s12, s15, s14
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s13, s13, 0
+; GCN-NEXT:    s_mul_i32 s11, s10, s11
+; GCN-NEXT:    s_add_u32 s11, s12, s11
+; GCN-NEXT:    s_addc_u32 s12, 0, s13
+; GCN-NEXT:    s_add_u32 s11, s2, s11
+; GCN-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    s_addc_u32 s10, s10, s12
+; GCN-NEXT:    s_mul_i32 s12, s0, s10
+; GCN-NEXT:    s_mul_i32 s1, s1, s11
 ; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s1, s12, s1
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    s_add_u32 s1, s11, s1
-; GCN-NEXT:    s_addc_u32 s1, s13, s10
-; GCN-NEXT:    v_readfirstlane_b32 s10, v1
-; GCN-NEXT:    s_addc_u32 s10, s10, 0
-; GCN-NEXT:    s_mul_i32 s0, s12, s0
-; GCN-NEXT:    s_add_u32 s0, s1, s0
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    s_add_u32 s11, s14, s0
-; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s0, s0, s1
-; GCN-NEXT:    s_addc_u32 s1, s12, s10
+; GCN-NEXT:    s_add_i32 s12, s13, s12
+; GCN-NEXT:    s_mul_i32 s0, s0, s11
+; GCN-NEXT:    s_add_i32 s1, s12, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    s_mul_i32 s13, s11, s1
+; GCN-NEXT:    v_readfirstlane_b32 s15, v2
+; GCN-NEXT:    s_add_u32 s13, s15, s13
+; GCN-NEXT:    v_readfirstlane_b32 s14, v0
+; GCN-NEXT:    s_mul_i32 s0, s10, s0
+; GCN-NEXT:    s_addc_u32 s14, 0, s14
+; GCN-NEXT:    v_readfirstlane_b32 s12, v3
+; GCN-NEXT:    s_add_u32 s0, s13, s0
+; GCN-NEXT:    s_addc_u32 s0, s14, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v1
+; GCN-NEXT:    s_addc_u32 s12, s12, 0
+; GCN-NEXT:    s_mul_i32 s1, s10, s1
+; GCN-NEXT:    s_add_u32 s0, s0, s1
+; GCN-NEXT:    s_addc_u32 s1, 0, s12
+; GCN-NEXT:    s_add_u32 s11, s11, s0
+; GCN-NEXT:    s_addc_u32 s1, s10, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_mul_i32 s4, s8, s4
 ; GCN-NEXT:    s_sub_u32 s6, s6, s4
 ; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s11, s4, s5
 ; GCN-NEXT:    s_subb_u32 s13, s10, s9
 ; GCN-NEXT:    s_sub_u32 s14, s6, s8
 ; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s15, s10, s11
 ; GCN-NEXT:    s_subb_u32 s15, s13, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_cmp_eq_u32 s15, s9
 ; GCN-NEXT:    s_cselect_b32 s16, s17, s16
 ; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s13, s13, s9
-; GCN-NEXT:    s_sub_u32 s17, s14, s8
-; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT:    s_or_b32 s10, s10, s11
-; GCN-NEXT:    s_subb_u32 s10, s13, 0
+; GCN-NEXT:    s_subb_u32 s10, s13, s9
+; GCN-NEXT:    s_sub_u32 s11, s14, s8
+; GCN-NEXT:    s_subb_u32 s10, s10, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s16, 0
-; GCN-NEXT:    s_cselect_b32 s11, s17, s14
+; GCN-NEXT:    s_cselect_b32 s11, s11, s14
 ; GCN-NEXT:    s_cselect_b32 s10, s10, s15
 ; GCN-NEXT:    s_or_b32 s4, s4, s5
 ; GCN-NEXT:    s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-NEXT:    s_cmp_lg_u32 s5, 0
 ; GCN-NEXT:    s_cselect_b32 s4, s10, s4
 ; GCN-NEXT:    s_cselect_b32 s5, s11, s6
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s14, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s8, s8, s9
 ; GCN-IR-NEXT:    s_addc_u32 s8, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    s_sub_u32 s12, s12, s18
 ; GCN-IR-NEXT:    s_subb_u32 s13, s13, s19
 ; GCN-IR-NEXT:    s_add_u32 s16, s16, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s18, s18, s19
 ; GCN-IR-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
@@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-LABEL: s_test_urem_k_num_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_sub_u32 s6, 0, s2
-; GCN-NEXT:    s_subb_u32 s8, 0, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT:    v_readfirstlane_b32 s6, v1
+; GCN-NEXT:    v_readfirstlane_b32 s7, v0
+; GCN-NEXT:    s_mul_i32 s8, s4, s6
+; GCN-NEXT:    v_readfirstlane_b32 s11, v2
+; GCN-NEXT:    s_mul_i32 s9, s5, s7
+; GCN-NEXT:    s_mul_i32 s10, s4, s7
+; GCN-NEXT:    s_add_i32 s8, s11, s8
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT:    s_add_i32 s8, s8, s9
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_mul_i32 s12, s7, s8
+; GCN-NEXT:    v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT:    s_add_u32 s9, s9, s12
+; GCN-NEXT:    v_readfirstlane_b32 s12, v0
+; GCN-NEXT:    s_mul_i32 s10, s6, s10
+; GCN-NEXT:    s_addc_u32 s12, 0, s12
+; GCN-NEXT:    v_readfirstlane_b32 s11, v4
+; GCN-NEXT:    s_add_u32 s9, s9, s10
+; GCN-NEXT:    v_readfirstlane_b32 s13, v1
+; GCN-NEXT:    s_addc_u32 s9, s12, s11
+; GCN-NEXT:    s_mul_i32 s8, s6, s8
+; GCN-NEXT:    s_addc_u32 s10, s13, 0
+; GCN-NEXT:    s_add_u32 s8, s9, s8
+; GCN-NEXT:    s_addc_u32 s9, 0, s10
+; GCN-NEXT:    s_add_u32 s8, s7, s8
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT:    s_addc_u32 s6, s6, s9
+; GCN-NEXT:    s_mul_i32 s9, s4, s6
+; GCN-NEXT:    s_mul_i32 s5, s5, s8
+; GCN-NEXT:    v_readfirstlane_b32 s10, v0
+; GCN-NEXT:    s_add_i32 s9, s10, s9
+; GCN-NEXT:    s_mul_i32 s4, s4, s8
+; GCN-NEXT:    s_add_i32 s5, s9, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT:    v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT:    v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT:    s_mul_i32 s10, s8, s5
+; GCN-NEXT:    v_readfirstlane_b32 s12, v2
+; GCN-NEXT:    s_add_u32 s10, s12, s10
+; GCN-NEXT:    v_readfirstlane_b32 s11, v0
+; GCN-NEXT:    s_mul_i32 s4, s6, s4
+; GCN-NEXT:    s_addc_u32 s11, 0, s11
+; GCN-NEXT:    v_readfirstlane_b32 s9, v3
+; GCN-NEXT:    s_add_u32 s4, s10, s4
+; GCN-NEXT:    s_addc_u32 s4, s11, s9
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
+; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_mul_i32 s5, s6, s5
+; GCN-NEXT:    s_add_u32 s4, s4, s5
+; GCN-NEXT:    s_addc_u32 s5, 0, s9
+; GCN-NEXT:    s_add_u32 s4, s8, s4
+; GCN-NEXT:    s_addc_u32 s5, s6, s5
+; GCN-NEXT:    v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT:    v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT:    s_mul_i32 s5, s5, 24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_readfirstlane_b32 s8, v1
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
-; GCN-NEXT:    s_mul_i32 s5, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_mul_i32 s10, s8, s4
-; GCN-NEXT:    s_mul_i32 s11, s6, s4
-; GCN-NEXT:    s_add_i32 s5, s12, s5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT:    s_add_i32 s5, s5, s10
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT:    v_readfirstlane_b32 s10, v3
-; GCN-NEXT:    v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT:    s_mul_i32 s13, s4, s5
-; GCN-NEXT:    s_add_u32 s10, s10, s13
-; GCN-NEXT:    v_readfirstlane_b32 s13, v0
-; GCN-NEXT:    s_mul_i32 s11, s9, s11
-; GCN-NEXT:    s_addc_u32 s13, 0, s13
-; GCN-NEXT:    v_readfirstlane_b32 s12, v4
-; GCN-NEXT:    s_add_u32 s10, s10, s11
-; GCN-NEXT:    v_readfirstlane_b32 s14, v1
-; GCN-NEXT:    s_addc_u32 s10, s13, s12
-; GCN-NEXT:    s_addc_u32 s11, s14, 0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_add_u32 s5, s10, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s11
-; GCN-NEXT:    s_add_u32 s11, s4, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s9, s9, s10
-; GCN-NEXT:    s_mul_i32 s4, s6, s9
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_i32 s4, s5, s4
-; GCN-NEXT:    s_mul_i32 s8, s8, s11
-; GCN-NEXT:    s_mul_i32 s5, s6, s11
-; GCN-NEXT:    s_add_i32 s4, s4, s8
-; GCN-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mul_i32 s8, s11, s4
-; GCN-NEXT:    v_readfirstlane_b32 s12, v2
-; GCN-NEXT:    s_add_u32 s8, s12, s8
-; GCN-NEXT:    v_readfirstlane_b32 s10, v0
-; GCN-NEXT:    s_mul_i32 s5, s9, s5
-; GCN-NEXT:    s_addc_u32 s10, 0, s10
-; GCN-NEXT:    v_readfirstlane_b32 s6, v3
 ; GCN-NEXT:    s_add_u32 s5, s8, s5
-; GCN-NEXT:    s_addc_u32 s5, s10, s6
-; GCN-NEXT:    v_readfirstlane_b32 s6, v1
-; GCN-NEXT:    s_addc_u32 s6, s6, 0
-; GCN-NEXT:    s_mul_i32 s4, s9, s4
-; GCN-NEXT:    s_add_u32 s4, s5, s4
-; GCN-NEXT:    s_addc_u32 s6, 0, s6
-; GCN-NEXT:    s_add_u32 s8, s11, s4
-; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT:    s_or_b32 s4, s4, s5
-; GCN-NEXT:    s_addc_u32 s4, s9, s6
-; GCN-NEXT:    v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT:    v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT:    s_mul_i32 s4, s4, 24
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v1
-; GCN-NEXT:    v_readfirstlane_b32 s5, v0
-; GCN-NEXT:    s_add_u32 s4, s8, s4
-; GCN-NEXT:    s_addc_u32 s8, 0, s5
+; GCN-NEXT:    s_addc_u32 s8, 0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
@@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_mul_i32 s0, s2, s8
 ; GCN-NEXT:    s_sub_u32 s11, 24, s0
 ; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s0, s1
 ; GCN-NEXT:    s_subb_u32 s12, s9, s3
 ; GCN-NEXT:    s_sub_u32 s13, s11, s2
 ; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s14, s8, s9
 ; GCN-NEXT:    s_subb_u32 s14, s12, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s14, s3
 ; GCN-NEXT:    s_cselect_b32 s15, -1, 0
@@ -912,13 +893,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_cmp_eq_u32 s14, s3
 ; GCN-NEXT:    s_cselect_b32 s15, s16, s15
 ; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s12, s12, s3
-; GCN-NEXT:    s_sub_u32 s16, s13, s2
-; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT:    s_or_b32 s8, s8, s9
-; GCN-NEXT:    s_subb_u32 s8, s12, 0
+; GCN-NEXT:    s_subb_u32 s8, s12, s3
+; GCN-NEXT:    s_sub_u32 s9, s13, s2
+; GCN-NEXT:    s_subb_u32 s8, s8, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s15, 0
-; GCN-NEXT:    s_cselect_b32 s9, s16, s13
+; GCN-NEXT:    s_cselect_b32 s9, s9, s13
 ; GCN-NEXT:    s_cselect_b32 s8, s8, s14
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_subb_u32 s0, 0, s10
@@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_cmp_lg_u32 s1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, s8, s0
 ; GCN-NEXT:    s_cselect_b32 s1, s9, s11
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB6_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s16
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s17
 ; GCN-IR-NEXT:    s_add_u32 s14, s14, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s16, s16, s17
 ; GCN-IR-NEXT:    s_addc_u32 s15, s15, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
@@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_cbranch_vccz .LBB7_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    s_add_u32 s11, s8, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s6, s6, s7
 ; GCN-IR-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
 ; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
@@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    s_sub_u32 s8, s8, s10
 ; GCN-IR-NEXT:    s_subb_u32 s9, s9, 0
 ; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; GCN-IR-NEXT:    s_or_b32 s14, s14, s15
 ; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index e8db6471b6a46..8a54ad301f48a 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_sub_u32 s2, s2, s8
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    s_subb_u32 s3, s3, s9
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; SI-NEXT:    s_mov_b32 s10, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_sub_u32 s4, s4, s6
-; SI-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT:    s_or_b32 s6, s12, s13
 ; SI-NEXT:    s_subb_u32 s5, s5, s7
 ; SI-NEXT:    s_mov_b32 s8, s0
 ; SI-NEXT:    s_mov_b32 s9, s1


