[llvm] [AMDGPU] Delete redundant s_or_b32 (PR #165261)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 5 06:26:25 PST 2025
https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/165261
From d9321dd183919007a5430d758360c8470f9c7818 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 22 Oct 2025 12:59:54 -0500
Subject: [PATCH 01/13] Delete redundant s_or_b32
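
Remove an s_or_b32 whose only purpose is to recompute SCC from a carry
mask that S_CSELECT_B64 materialized from SCC in the first place. A
minimal before/after sketch, paraphrased from the
expand-scalar-carry-out-select-user.ll diff in this patch (register
numbers are illustrative):

    ; before: the carry is round-tripped through a 64-bit mask
    s_add_u32     s7, s6, s6
    s_cselect_b64 s[4:5], -1, 0   ; materialize SCC into a register pair
    s_or_b32      s4, s4, s5      ; result unused; only re-derives SCC
    s_addc_u32    s8, s6, 0

    ; after: s_addc_u32 consumes SCC from s_add_u32 directly
    s_add_u32     s7, s6, s6
    s_addc_u32    s8, s6, 0
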
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 21 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 941 +++++++++---------
.../test/CodeGen/AMDGPU/carryout-selection.ll | 4 -
.../expand-scalar-carry-out-select-user.ll | 10 +-
llvm/test/CodeGen/AMDGPU/sdiv64.ll | 368 ++++---
llvm/test/CodeGen/AMDGPU/srem64.ll | 410 ++++----
llvm/test/CodeGen/AMDGPU/uaddo.ll | 6 +-
llvm/test/CodeGen/AMDGPU/udiv64.ll | 199 ++--
llvm/test/CodeGen/AMDGPU/urem64.ll | 296 +++---
llvm/test/CodeGen/AMDGPU/usubo.ll | 6 +-
11 files changed, 1058 insertions(+), 1205 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d9f76c9a59d00..305c9c40ab726 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10160,7 +10160,7 @@ static bool followSubRegDef(MachineInstr &MI,
}
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
if (!P.Reg.isVirtual())
return nullptr;
@@ -10689,6 +10689,25 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!optimizeSCC(Def, &CmpInstr, RI))
return false;
+ // If the s_or_b32 result is unused (i.e. it is effectively a 64-bit
+ // s_cmp_lg of a register pair) and the input is a 64-bit foldableSelect,
+ // then transform:
+ //
+ //   (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0) =>
+ //   (S_CSELECT_B64 (non-zero imm), 0)
+ if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
+ MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
+ MachineOperand OrOpnd1 = Def->getOperand(1);
+ MachineOperand OrOpnd2 = Def->getOperand(2);
+
+ if (OrOpnd1.isReg() && OrOpnd2.isReg() &&
+ OrOpnd1.getReg() != OrOpnd2.getReg()) {
+ auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI);
+ auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI);
+ if (Def1 == Def2 && foldableSelect(Def1))
+ optimizeSCC(Def1, Def);
+ }
+ }
return true;
};
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dc23a21f959ce..c4d0678c0f989 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
/// skipping copy like instructions and subreg-manipulation pseudos.
/// Following another subreg of a reg:subreg isn't supported.
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
- MachineRegisterInfo &MRI);
+ const MachineRegisterInfo &MRI);
/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 51df8c34cc55e..54b1554ae5d04 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7772,7 +7772,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX6-NEXT: s_ashr_i32 s8, s1, 31
@@ -7782,8 +7781,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GFX6-NEXT: s_sub_u32 s12, 0, s10
-; GFX6-NEXT: s_subb_u32 s13, 0, s11
+; GFX6-NEXT: s_sub_u32 s0, 0, s10
+; GFX6-NEXT: s_subb_u32 s1, 0, s11
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -7792,128 +7791,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_mul_i32 s15, s13, s0
-; GFX6-NEXT: s_mul_i32 s16, s12, s0
-; GFX6-NEXT: s_add_i32 s1, s17, s1
-; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT: s_add_i32 s1, s1, s15
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16
-; GFX6-NEXT: v_readfirstlane_b32 s15, v3
-; GFX6-NEXT: s_mul_i32 s17, s0, s1
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT: s_add_u32 s15, s15, s17
-; GFX6-NEXT: v_readfirstlane_b32 s17, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s17
-; GFX6-NEXT: s_mul_i32 s16, s14, s16
-; GFX6-NEXT: v_readfirstlane_b32 s18, v4
-; GFX6-NEXT: s_add_u32 s15, s15, s16
-; GFX6-NEXT: s_addc_u32 s15, s17, s18
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
-; GFX6-NEXT: s_addc_u32 s16, s16, 0
-; GFX6-NEXT: s_mul_i32 s1, s14, s1
-; GFX6-NEXT: s_add_u32 s1, s15, s1
-; GFX6-NEXT: s_addc_u32 s15, 0, s16
-; GFX6-NEXT: s_add_u32 s16, s0, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s14, s14, s15
-; GFX6-NEXT: s_mul_i32 s0, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s1, v0
-; GFX6-NEXT: s_add_i32 s0, s1, s0
-; GFX6-NEXT: s_mul_i32 s13, s13, s16
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
-; GFX6-NEXT: s_add_i32 s0, s0, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT: s_mul_i32 s13, s16, s0
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s15, v0
-; GFX6-NEXT: s_mul_i32 s1, s14, s1
-; GFX6-NEXT: s_addc_u32 s15, 0, s15
-; GFX6-NEXT: v_readfirstlane_b32 s12, v3
-; GFX6-NEXT: s_add_u32 s1, s13, s1
-; GFX6-NEXT: s_addc_u32 s1, s15, s12
+; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: s_addc_u32 s12, s12, 0
-; GFX6-NEXT: s_mul_i32 s0, s14, s0
-; GFX6-NEXT: s_add_u32 s0, s1, s0
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: s_add_u32 s15, s16, s0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s14, s14, s12
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s13, s0, s12
+; GFX6-NEXT: v_readfirstlane_b32 s16, v2
+; GFX6-NEXT: s_mul_i32 s14, s1, s2
+; GFX6-NEXT: s_mul_i32 s15, s0, s2
+; GFX6-NEXT: s_add_i32 s13, s16, s13
+; GFX6-NEXT: v_mul_hi_u32 v3, v0, s15
+; GFX6-NEXT: s_add_i32 s13, s13, s14
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v4, v1, s15
+; GFX6-NEXT: v_readfirstlane_b32 s14, v3
+; GFX6-NEXT: s_mul_i32 s16, s2, s13
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT: s_add_u32 s14, s14, s16
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: s_mul_i32 s15, s12, s15
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: v_readfirstlane_b32 s17, v4
+; GFX6-NEXT: s_add_u32 s14, s14, s15
+; GFX6-NEXT: s_addc_u32 s14, s16, s17
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
+; GFX6-NEXT: s_addc_u32 s15, s15, 0
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: s_add_u32 s13, s14, s13
+; GFX6-NEXT: s_addc_u32 s14, 0, s15
+; GFX6-NEXT: s_add_u32 s13, s2, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT: s_addc_u32 s12, s12, s14
+; GFX6-NEXT: s_mul_i32 s14, s0, s12
+; GFX6-NEXT: s_mul_i32 s1, s1, s13
+; GFX6-NEXT: v_readfirstlane_b32 s15, v0
+; GFX6-NEXT: s_add_i32 s14, s15, s14
+; GFX6-NEXT: s_mul_i32 s0, s0, s13
+; GFX6-NEXT: s_add_i32 s1, s14, s1
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT: s_mul_i32 s15, s13, s1
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s15, s17, s15
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: s_mul_i32 s0, s12, s0
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: v_readfirstlane_b32 s14, v3
+; GFX6-NEXT: s_add_u32 s0, s15, s0
+; GFX6-NEXT: s_addc_u32 s0, s16, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_addc_u32 s14, s14, 0
+; GFX6-NEXT: s_mul_i32 s1, s12, s1
+; GFX6-NEXT: s_add_u32 s0, s0, s1
+; GFX6-NEXT: s_addc_u32 s1, 0, s14
+; GFX6-NEXT: s_add_u32 s14, s13, s0
+; GFX6-NEXT: s_addc_u32 s15, s12, s1
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
; GFX6-NEXT: s_mov_b32 s13, s12
; GFX6-NEXT: s_addc_u32 s1, s7, s12
; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s15
; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s15
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT: s_mul_i32 s1, s6, s14
+; GFX6-NEXT: s_mul_i32 s1, s6, s15
; GFX6-NEXT: v_readfirstlane_b32 s16, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
; GFX6-NEXT: s_add_u32 s1, s16, s1
; GFX6-NEXT: s_addc_u32 s4, 0, s4
-; GFX6-NEXT: s_mul_i32 s15, s7, s15
+; GFX6-NEXT: s_mul_i32 s14, s7, s14
; GFX6-NEXT: v_readfirstlane_b32 s16, v1
-; GFX6-NEXT: s_add_u32 s1, s1, s15
+; GFX6-NEXT: s_add_u32 s1, s1, s14
; GFX6-NEXT: s_addc_u32 s1, s4, s16
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
-; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s16, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_mul_i32 s14, s7, s15
+; GFX6-NEXT: s_add_u32 s14, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s4
+; GFX6-NEXT: s_addc_u32 s15, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s17
+; GFX6-NEXT: s_mul_i32 s4, s10, s15
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s16
-; GFX6-NEXT: s_add_i32 s18, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s18
-; GFX6-NEXT: s_mul_i32 s4, s10, s16
+; GFX6-NEXT: s_mul_i32 s5, s11, s14
+; GFX6-NEXT: s_add_i32 s16, s4, s5
+; GFX6-NEXT: s_sub_i32 s17, s7, s16
+; GFX6-NEXT: s_mul_i32 s4, s10, s14
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s4, s5
-; GFX6-NEXT: s_subb_u32 s19, s14, s11
-; GFX6-NEXT: s_sub_u32 s20, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_subb_u32 s17, s17, s11
+; GFX6-NEXT: s_sub_u32 s18, s6, s10
+; GFX6-NEXT: s_subb_u32 s17, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s11
; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s20, s16, 2
-; GFX6-NEXT: s_addc_u32 s21, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s20, s15
-; GFX6-NEXT: s_cselect_b32 s15, s21, s19
+; GFX6-NEXT: s_cmp_ge_u32 s18, s10
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s11
+; GFX6-NEXT: s_cselect_b32 s17, s18, s19
+; GFX6-NEXT: s_add_u32 s18, s14, 1
+; GFX6-NEXT: s_addc_u32 s19, s15, 0
+; GFX6-NEXT: s_add_u32 s20, s14, 2
+; GFX6-NEXT: s_addc_u32 s21, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s17, s20, s18
+; GFX6-NEXT: s_cselect_b32 s18, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_subb_u32 s4, s7, s18
+; GFX6-NEXT: s_subb_u32 s4, s7, s16
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
@@ -7921,13 +7913,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_cmp_eq_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s4, s6, s5
; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s5, s15, s17
-; GFX6-NEXT: s_cselect_b32 s4, s14, s16
+; GFX6-NEXT: s_cselect_b32 s5, s18, s15
+; GFX6-NEXT: s_cselect_b32 s4, s17, s14
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
; GFX6-NEXT: s_subb_u32 s5, s5, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -8278,8 +8271,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT: s_sub_u32 s14, 0, s6
-; GFX6-NEXT: s_subb_u32 s15, 0, s7
+; GFX6-NEXT: s_sub_u32 s12, 0, s6
+; GFX6-NEXT: s_subb_u32 s13, 0, s7
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8288,69 +8281,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
-; GFX6-NEXT: s_mul_i32 s13, s14, s16
+; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v0
+; GFX6-NEXT: s_mul_i32 s16, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s19, v2
-; GFX6-NEXT: s_mul_i32 s17, s15, s12
-; GFX6-NEXT: s_mul_i32 s18, s14, s12
-; GFX6-NEXT: s_add_i32 s13, s19, s13
+; GFX6-NEXT: s_mul_i32 s17, s13, s15
+; GFX6-NEXT: s_mul_i32 s18, s12, s15
+; GFX6-NEXT: s_add_i32 s16, s19, s16
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s18
-; GFX6-NEXT: s_add_i32 s13, s13, s17
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT: s_add_i32 s16, s16, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s16
; GFX6-NEXT: v_mul_hi_u32 v4, v1, s18
; GFX6-NEXT: v_readfirstlane_b32 s17, v3
-; GFX6-NEXT: s_mul_i32 s20, s12, s13
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT: s_mul_i32 s20, s15, s16
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s16
; GFX6-NEXT: s_add_u32 s17, s17, s20
; GFX6-NEXT: v_readfirstlane_b32 s20, v0
-; GFX6-NEXT: s_mul_i32 s18, s16, s18
+; GFX6-NEXT: s_mul_i32 s18, s14, s18
; GFX6-NEXT: s_addc_u32 s20, 0, s20
; GFX6-NEXT: v_readfirstlane_b32 s19, v4
; GFX6-NEXT: s_add_u32 s17, s17, s18
; GFX6-NEXT: s_addc_u32 s17, s20, s19
; GFX6-NEXT: v_readfirstlane_b32 s18, v1
; GFX6-NEXT: s_addc_u32 s18, s18, 0
-; GFX6-NEXT: s_mul_i32 s13, s16, s13
-; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: s_mul_i32 s16, s14, s16
+; GFX6-NEXT: s_add_u32 s16, s17, s16
; GFX6-NEXT: s_addc_u32 s17, 0, s18
-; GFX6-NEXT: s_add_u32 s18, s12, s13
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_addc_u32 s16, s16, s17
-; GFX6-NEXT: s_mul_i32 s12, s14, s16
-; GFX6-NEXT: v_readfirstlane_b32 s13, v0
-; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s15, s15, s18
-; GFX6-NEXT: s_mul_i32 s13, s14, s18
-; GFX6-NEXT: s_add_i32 s12, s12, s15
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mul_hi_u32 v3, s16, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s18, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s18, v0
-; GFX6-NEXT: s_mul_i32 s15, s18, s12
-; GFX6-NEXT: v_readfirstlane_b32 s19, v2
-; GFX6-NEXT: s_add_u32 s15, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s15, s16
+; GFX6-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_addc_u32 s14, s14, s17
+; GFX6-NEXT: s_mul_i32 s16, s12, s14
+; GFX6-NEXT: s_mul_i32 s13, s13, s15
; GFX6-NEXT: v_readfirstlane_b32 s17, v0
-; GFX6-NEXT: s_mul_i32 s13, s16, s13
-; GFX6-NEXT: s_addc_u32 s17, 0, s17
-; GFX6-NEXT: v_readfirstlane_b32 s14, v3
-; GFX6-NEXT: s_add_u32 s13, s15, s13
-; GFX6-NEXT: s_addc_u32 s13, s17, s14
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: s_addc_u32 s14, s14, 0
-; GFX6-NEXT: s_mul_i32 s12, s16, s12
-; GFX6-NEXT: s_add_u32 s12, s13, s12
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
-; GFX6-NEXT: s_add_u32 s15, s18, s12
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_addc_u32 s14, s16, s14
+; GFX6-NEXT: s_add_i32 s16, s17, s16
+; GFX6-NEXT: s_mul_i32 s12, s12, s15
+; GFX6-NEXT: s_add_i32 s13, s16, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s15, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s15, v0
+; GFX6-NEXT: s_mul_i32 s17, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s19, v2
+; GFX6-NEXT: s_add_u32 s17, s19, s17
+; GFX6-NEXT: v_readfirstlane_b32 s18, v0
+; GFX6-NEXT: s_mul_i32 s12, s14, s12
+; GFX6-NEXT: s_addc_u32 s18, 0, s18
+; GFX6-NEXT: v_readfirstlane_b32 s16, v3
+; GFX6-NEXT: s_add_u32 s12, s17, s12
+; GFX6-NEXT: s_addc_u32 s12, s18, s16
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_addc_u32 s16, s16, 0
+; GFX6-NEXT: s_mul_i32 s13, s14, s13
+; GFX6-NEXT: s_add_u32 s12, s12, s13
+; GFX6-NEXT: s_addc_u32 s13, 0, s16
+; GFX6-NEXT: s_add_u32 s15, s15, s12
+; GFX6-NEXT: s_addc_u32 s14, s14, s13
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
; GFX6-NEXT: s_mov_b32 s13, s12
@@ -8374,40 +8363,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s18, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: s_add_u32 s17, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s17
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s19
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s16
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s18
-; GFX6-NEXT: s_add_i32 s20, s14, s15
-; GFX6-NEXT: s_sub_i32 s16, s9, s20
-; GFX6-NEXT: s_mul_i32 s14, s6, s18
+; GFX6-NEXT: s_mul_i32 s15, s7, s17
+; GFX6-NEXT: s_add_i32 s18, s14, s15
+; GFX6-NEXT: s_sub_i32 s19, s9, s18
+; GFX6-NEXT: s_mul_i32 s14, s6, s17
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s14, s15
-; GFX6-NEXT: s_subb_u32 s21, s16, s7
-; GFX6-NEXT: s_sub_u32 s22, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s16, s17
-; GFX6-NEXT: s_subb_u32 s16, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_subb_u32 s19, s19, s7
+; GFX6-NEXT: s_sub_u32 s20, s8, s6
+; GFX6-NEXT: s_subb_u32 s19, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s7
; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s16, s21, s17
-; GFX6-NEXT: s_add_u32 s17, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_cselect_b32 s16, s22, s17
-; GFX6-NEXT: s_cselect_b32 s17, s23, s21
+; GFX6-NEXT: s_cmp_ge_u32 s20, s6
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s19, s20, s21
+; GFX6-NEXT: s_add_u32 s20, s17, 1
+; GFX6-NEXT: s_addc_u32 s21, s16, 0
+; GFX6-NEXT: s_add_u32 s22, s17, 2
+; GFX6-NEXT: s_addc_u32 s23, s16, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s19, s22, s20
+; GFX6-NEXT: s_cselect_b32 s20, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s9, s9, s20
+; GFX6-NEXT: s_subb_u32 s9, s9, s18
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
@@ -8415,12 +8401,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s17, s19
-; GFX6-NEXT: s_cselect_b32 s6, s16, s18
+; GFX6-NEXT: s_cselect_b32 s7, s20, s16
+; GFX6-NEXT: s_cselect_b32 s6, s19, s17
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s16, s6, s2
-; GFX6-NEXT: s_subb_u32 s17, s7, s3
+; GFX6-NEXT: s_sub_u32 s14, s6, s2
+; GFX6-NEXT: s_subb_u32 s15, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8428,8 +8414,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT: s_sub_u32 s12, 0, s8
-; GFX6-NEXT: s_subb_u32 s13, 0, s9
+; GFX6-NEXT: s_sub_u32 s2, 0, s8
+; GFX6-NEXT: s_subb_u32 s3, 0, s9
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8438,128 +8424,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s3, v2
-; GFX6-NEXT: s_mul_i32 s0, s13, s2
-; GFX6-NEXT: s_add_i32 s1, s3, s1
-; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s12, s2
-; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT: s_mul_i32 s4, s2, s3
-; GFX6-NEXT: v_readfirstlane_b32 s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s13, s2, s12
+; GFX6-NEXT: v_readfirstlane_b32 s16, v2
+; GFX6-NEXT: s_mul_i32 s1, s3, s0
+; GFX6-NEXT: s_add_i32 s13, s16, s13
+; GFX6-NEXT: s_add_i32 s13, s13, s1
+; GFX6-NEXT: s_mul_i32 s1, s2, s0
+; GFX6-NEXT: v_mul_hi_u32 v2, v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT: s_mul_i32 s16, s0, s13
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT: s_add_u32 s4, s18, s4
-; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT: s_add_u32 s16, s18, s16
+; GFX6-NEXT: s_addc_u32 s17, 0, s17
+; GFX6-NEXT: s_mul_i32 s1, s12, s1
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
-; GFX6-NEXT: s_addc_u32 s4, s5, s18
-; GFX6-NEXT: v_readfirstlane_b32 s5, v1
-; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
-; GFX6-NEXT: s_add_u32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s4, 0, s5
-; GFX6-NEXT: s_add_u32 s5, s2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
-; GFX6-NEXT: s_mul_i32 s2, s12, s4
-; GFX6-NEXT: v_readfirstlane_b32 s3, v0
-; GFX6-NEXT: s_add_i32 s2, s3, s2
-; GFX6-NEXT: s_mul_i32 s13, s13, s5
-; GFX6-NEXT: s_mul_i32 s3, s12, s5
-; GFX6-NEXT: s_add_i32 s2, s2, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: s_add_u32 s1, s16, s1
+; GFX6-NEXT: s_addc_u32 s1, s17, s18
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_addc_u32 s16, s16, 0
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: s_add_u32 s1, s1, s13
+; GFX6-NEXT: s_addc_u32 s13, 0, s16
+; GFX6-NEXT: s_add_u32 s16, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: s_addc_u32 s4, s12, s13
+; GFX6-NEXT: s_mul_i32 s5, s2, s4
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_add_i32 s5, s12, s5
+; GFX6-NEXT: s_mul_i32 s3, s3, s16
+; GFX6-NEXT: s_mul_i32 s2, s2, s16
+; GFX6-NEXT: s_add_i32 s3, s5, s3
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v0, s3
; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s13, s15, s13
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
-; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
-; GFX6-NEXT: v_readfirstlane_b32 s12, v3
-; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s12
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
+; GFX6-NEXT: s_mul_i32 s12, s16, s3
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s12, s17, s12
+; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
-; GFX6-NEXT: s_add_u32 s2, s3, s2
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: s_add_u32 s13, s5, s2
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s12, s4, s12
+; GFX6-NEXT: s_addc_u32 s13, 0, s13
+; GFX6-NEXT: v_readfirstlane_b32 s5, v3
+; GFX6-NEXT: s_add_u32 s2, s12, s2
+; GFX6-NEXT: s_addc_u32 s2, s13, s5
+; GFX6-NEXT: v_readfirstlane_b32 s5, v1
+; GFX6-NEXT: s_addc_u32 s5, s5, 0
+; GFX6-NEXT: s_mul_i32 s3, s4, s3
+; GFX6-NEXT: s_add_u32 s2, s2, s3
+; GFX6-NEXT: s_addc_u32 s3, 0, s5
+; GFX6-NEXT: s_add_u32 s12, s16, s2
+; GFX6-NEXT: s_addc_u32 s13, s4, s3
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
-; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_mul_i32 s2, s10, s13
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v3
+; GFX6-NEXT: v_readfirstlane_b32 s17, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s15, s2
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
-; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s15, v1
-; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s14, s15
-; GFX6-NEXT: v_readfirstlane_b32 s13, v0
-; GFX6-NEXT: s_addc_u32 s13, s13, 0
+; GFX6-NEXT: s_add_u32 s2, s17, s2
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s18, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: s_add_u32 s2, s2, s12
+; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s13, s11, s13
+; GFX6-NEXT: s_add_u32 s16, s2, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s19
+; GFX6-NEXT: s_addc_u32 s17, 0, s12
+; GFX6-NEXT: s_mul_i32 s12, s8, s17
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s18
-; GFX6-NEXT: s_add_i32 s20, s12, s13
-; GFX6-NEXT: s_sub_i32 s14, s11, s20
-; GFX6-NEXT: s_mul_i32 s12, s8, s18
+; GFX6-NEXT: s_mul_i32 s13, s9, s16
+; GFX6-NEXT: s_add_i32 s18, s12, s13
+; GFX6-NEXT: s_sub_i32 s19, s11, s18
+; GFX6-NEXT: s_mul_i32 s12, s8, s16
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
-; GFX6-NEXT: s_subb_u32 s21, s14, s9
-; GFX6-NEXT: s_sub_u32 s22, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_subb_u32 s19, s19, s9
+; GFX6-NEXT: s_sub_u32 s20, s10, s8
+; GFX6-NEXT: s_subb_u32 s19, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s9
; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_add_u32 s15, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s22, s15
-; GFX6-NEXT: s_cselect_b32 s15, s23, s21
+; GFX6-NEXT: s_cmp_ge_u32 s20, s8
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s9
+; GFX6-NEXT: s_cselect_b32 s19, s20, s21
+; GFX6-NEXT: s_add_u32 s20, s16, 1
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_add_u32 s22, s16, 2
+; GFX6-NEXT: s_addc_u32 s23, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s19, s22, s20
+; GFX6-NEXT: s_cselect_b32 s20, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s11, s11, s20
+; GFX6-NEXT: s_subb_u32 s11, s11, s18
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
@@ -8567,15 +8546,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s15, s19
-; GFX6-NEXT: s_cselect_b32 s8, s14, s18
+; GFX6-NEXT: s_cselect_b32 s9, s20, s17
+; GFX6-NEXT: s_cselect_b32 s8, s19, s16
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9015,105 +8994,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT: s_sub_u32 s10, 0, s8
-; GFX6-NEXT: s_subb_u32 s11, 0, s9
+; GFX6-NEXT: s_sub_u32 s0, 0, s8
+; GFX6-NEXT: s_subb_u32 s1, 0, s9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s10, v0
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s1, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_mul_i32 s13, s11, s0
-; GFX6-NEXT: s_mul_i32 s14, s10, s0
-; GFX6-NEXT: s_add_i32 s1, s15, s1
-; GFX6-NEXT: v_mul_hi_u32 v3, v0, s14
-; GFX6-NEXT: s_add_i32 s1, s1, s13
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, s14
-; GFX6-NEXT: v_readfirstlane_b32 s13, v3
-; GFX6-NEXT: s_mul_i32 s15, s0, s1
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT: s_add_u32 s13, s13, s15
-; GFX6-NEXT: v_readfirstlane_b32 s15, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s15
-; GFX6-NEXT: s_mul_i32 s14, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s16, v4
-; GFX6-NEXT: s_add_u32 s13, s13, s14
-; GFX6-NEXT: s_addc_u32 s13, s15, s16
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: s_addc_u32 s14, s14, 0
-; GFX6-NEXT: s_mul_i32 s1, s12, s1
-; GFX6-NEXT: s_add_u32 s1, s13, s1
-; GFX6-NEXT: s_addc_u32 s13, 0, s14
-; GFX6-NEXT: s_add_u32 s14, s0, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s12, s12, s13
-; GFX6-NEXT: s_mul_i32 s0, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s1, v0
-; GFX6-NEXT: s_add_i32 s0, s1, s0
-; GFX6-NEXT: s_mul_i32 s11, s11, s14
-; GFX6-NEXT: s_mul_i32 s1, s10, s14
-; GFX6-NEXT: s_add_i32 s0, s0, s11
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s14, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT: s_mul_i32 s11, s14, s0
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s11, s15, s11
-; GFX6-NEXT: v_readfirstlane_b32 s13, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s1
-; GFX6-NEXT: s_addc_u32 s13, 0, s13
-; GFX6-NEXT: v_readfirstlane_b32 s10, v3
-; GFX6-NEXT: s_add_u32 s1, s11, s1
-; GFX6-NEXT: s_addc_u32 s1, s13, s10
+; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
-; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s0, s12, s0
-; GFX6-NEXT: s_add_u32 s0, s1, s0
-; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_add_u32 s13, s14, s0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s12, s12, s10
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s11, s0, s10
+; GFX6-NEXT: v_readfirstlane_b32 s14, v2
+; GFX6-NEXT: s_mul_i32 s12, s1, s2
+; GFX6-NEXT: s_mul_i32 s13, s0, s2
+; GFX6-NEXT: s_add_i32 s11, s14, s11
+; GFX6-NEXT: v_mul_hi_u32 v3, v0, s13
+; GFX6-NEXT: s_add_i32 s11, s11, s12
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s11
+; GFX6-NEXT: v_mul_hi_u32 v4, v1, s13
+; GFX6-NEXT: v_readfirstlane_b32 s12, v3
+; GFX6-NEXT: s_mul_i32 s14, s2, s11
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s11
+; GFX6-NEXT: s_add_u32 s12, s12, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: s_mul_i32 s13, s10, s13
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: v_readfirstlane_b32 s15, v4
+; GFX6-NEXT: s_add_u32 s12, s12, s13
+; GFX6-NEXT: s_addc_u32 s12, s14, s15
+; GFX6-NEXT: v_readfirstlane_b32 s13, v1
+; GFX6-NEXT: s_addc_u32 s13, s13, 0
+; GFX6-NEXT: s_mul_i32 s11, s10, s11
+; GFX6-NEXT: s_add_u32 s11, s12, s11
+; GFX6-NEXT: s_addc_u32 s12, 0, s13
+; GFX6-NEXT: s_add_u32 s11, s2, s11
+; GFX6-NEXT: v_mov_b32_e32 v0, s11
+; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT: s_addc_u32 s10, s10, s12
+; GFX6-NEXT: s_mul_i32 s12, s0, s10
+; GFX6-NEXT: s_mul_i32 s1, s1, s11
+; GFX6-NEXT: v_readfirstlane_b32 s13, v0
+; GFX6-NEXT: s_add_i32 s12, s13, s12
+; GFX6-NEXT: s_mul_i32 s0, s0, s11
+; GFX6-NEXT: s_add_i32 s1, s12, s1
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
+; GFX6-NEXT: s_mul_i32 s13, s11, s1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: s_mul_i32 s0, s10, s0
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: v_readfirstlane_b32 s12, v3
+; GFX6-NEXT: s_add_u32 s0, s13, s0
+; GFX6-NEXT: s_addc_u32 s0, s14, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s1, s10, s1
+; GFX6-NEXT: s_add_u32 s0, s0, s1
+; GFX6-NEXT: s_addc_u32 s1, 0, s12
+; GFX6-NEXT: s_add_u32 s12, s11, s0
+; GFX6-NEXT: s_addc_u32 s13, s10, s1
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
; GFX6-NEXT: s_mov_b32 s11, s10
; GFX6-NEXT: s_addc_u32 s1, s7, s10
; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT: s_mul_i32 s1, s6, s12
+; GFX6-NEXT: s_mul_i32 s1, s6, s13
; GFX6-NEXT: v_readfirstlane_b32 s14, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
; GFX6-NEXT: s_add_u32 s1, s14, s1
; GFX6-NEXT: s_addc_u32 s4, 0, s4
-; GFX6-NEXT: s_mul_i32 s13, s7, s13
+; GFX6-NEXT: s_mul_i32 s12, s7, s12
; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: s_add_u32 s1, s1, s13
+; GFX6-NEXT: s_add_u32 s1, s1, s12
; GFX6-NEXT: s_addc_u32 s1, s4, s14
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
-; GFX6-NEXT: s_mul_i32 s12, s7, s12
+; GFX6-NEXT: s_mul_i32 s12, s7, s13
; GFX6-NEXT: s_add_u32 s12, s1, s12
; GFX6-NEXT: v_mov_b32_e32 v0, s12
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
@@ -9128,11 +9102,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s4, s5
; GFX6-NEXT: s_subb_u32 s15, s13, s9
; GFX6-NEXT: s_sub_u32 s16, s6, s8
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s12, s13
; GFX6-NEXT: s_subb_u32 s17, s15, 0
; GFX6-NEXT: s_cmp_ge_u32 s17, s9
; GFX6-NEXT: s_cselect_b32 s18, -1, 0
@@ -9141,13 +9113,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_cmp_eq_u32 s17, s9
; GFX6-NEXT: s_cselect_b32 s18, s19, s18
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s15, s15, s9
-; GFX6-NEXT: s_sub_u32 s19, s16, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_subb_u32 s12, s15, s9
+; GFX6-NEXT: s_sub_u32 s13, s16, s8
+; GFX6-NEXT: s_subb_u32 s12, s12, 0
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s13, s13, s16
; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
; GFX6-NEXT: s_subb_u32 s4, s7, s14
@@ -9164,6 +9134,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -9405,8 +9376,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT: s_sub_u32 s12, 0, s2
-; GFX6-NEXT: s_subb_u32 s13, 0, s3
+; GFX6-NEXT: s_sub_u32 s6, 0, s2
+; GFX6-NEXT: s_subb_u32 s7, 0, s3
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9415,69 +9386,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: v_readfirstlane_b32 s6, v0
-; GFX6-NEXT: s_mul_i32 s7, s12, s14
+; GFX6-NEXT: v_mul_hi_u32 v2, s6, v0
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: v_readfirstlane_b32 s13, v0
+; GFX6-NEXT: s_mul_i32 s14, s6, s12
; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_mul_i32 s15, s13, s6
-; GFX6-NEXT: s_mul_i32 s16, s12, s6
-; GFX6-NEXT: s_add_i32 s7, s17, s7
+; GFX6-NEXT: s_mul_i32 s15, s7, s13
+; GFX6-NEXT: s_mul_i32 s16, s6, s13
+; GFX6-NEXT: s_add_i32 s14, s17, s14
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT: s_add_i32 s7, s7, s15
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s7
+; GFX6-NEXT: s_add_i32 s14, s14, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s14
; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16
; GFX6-NEXT: v_readfirstlane_b32 s15, v3
-; GFX6-NEXT: s_mul_i32 s18, s6, s7
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s7
+; GFX6-NEXT: s_mul_i32 s18, s13, s14
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s14
; GFX6-NEXT: s_add_u32 s15, s15, s18
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_mul_i32 s16, s14, s16
+; GFX6-NEXT: s_mul_i32 s16, s12, s16
; GFX6-NEXT: s_addc_u32 s18, 0, s18
; GFX6-NEXT: v_readfirstlane_b32 s17, v4
; GFX6-NEXT: s_add_u32 s15, s15, s16
; GFX6-NEXT: s_addc_u32 s15, s18, s17
; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: s_addc_u32 s16, s16, 0
-; GFX6-NEXT: s_mul_i32 s7, s14, s7
-; GFX6-NEXT: s_add_u32 s7, s15, s7
+; GFX6-NEXT: s_mul_i32 s14, s12, s14
+; GFX6-NEXT: s_add_u32 s14, s15, s14
; GFX6-NEXT: s_addc_u32 s15, 0, s16
-; GFX6-NEXT: s_add_u32 s16, s6, s7
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_addc_u32 s14, s14, s15
-; GFX6-NEXT: s_mul_i32 s6, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s7, v0
-; GFX6-NEXT: s_add_i32 s6, s7, s6
-; GFX6-NEXT: s_mul_i32 s13, s13, s16
-; GFX6-NEXT: s_mul_i32 s7, s12, s16
-; GFX6-NEXT: s_add_i32 s6, s6, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s7
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT: s_mul_i32 s13, s16, s6
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: s_add_u32 s13, s13, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT: s_addc_u32 s12, s12, s15
+; GFX6-NEXT: s_mul_i32 s14, s6, s12
+; GFX6-NEXT: s_mul_i32 s7, s7, s13
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
-; GFX6-NEXT: s_mul_i32 s7, s14, s7
-; GFX6-NEXT: s_addc_u32 s15, 0, s15
-; GFX6-NEXT: v_readfirstlane_b32 s12, v3
-; GFX6-NEXT: s_add_u32 s7, s13, s7
-; GFX6-NEXT: s_addc_u32 s7, s15, s12
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: s_addc_u32 s12, s12, 0
-; GFX6-NEXT: s_mul_i32 s6, s14, s6
-; GFX6-NEXT: s_add_u32 s6, s7, s6
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: s_add_u32 s13, s16, s6
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_addc_u32 s12, s14, s12
+; GFX6-NEXT: s_add_i32 s14, s15, s14
+; GFX6-NEXT: s_mul_i32 s6, s6, s13
+; GFX6-NEXT: s_add_i32 s7, s14, s7
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NEXT: v_mov_b32_e32 v0, s7
+; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT: s_mul_i32 s15, s13, s7
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s15, s17, s15
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: s_mul_i32 s6, s12, s6
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: v_readfirstlane_b32 s14, v3
+; GFX6-NEXT: s_add_u32 s6, s15, s6
+; GFX6-NEXT: s_addc_u32 s6, s16, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_addc_u32 s14, s14, 0
+; GFX6-NEXT: s_mul_i32 s7, s12, s7
+; GFX6-NEXT: s_add_u32 s6, s6, s7
+; GFX6-NEXT: s_addc_u32 s7, 0, s14
+; GFX6-NEXT: s_add_u32 s13, s13, s6
+; GFX6-NEXT: s_addc_u32 s12, s12, s7
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -9514,11 +9481,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
; GFX6-NEXT: s_subb_u32 s17, s14, s3
; GFX6-NEXT: s_sub_u32 s18, s8, s2
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s14, s15
; GFX6-NEXT: s_subb_u32 s19, s17, 0
; GFX6-NEXT: s_cmp_ge_u32 s19, s3
; GFX6-NEXT: s_cselect_b32 s20, -1, 0
@@ -9527,13 +9492,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s19, s3
; GFX6-NEXT: s_cselect_b32 s20, s21, s20
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s17, s17, s3
-; GFX6-NEXT: s_sub_u32 s21, s18, s2
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s17, 0
+; GFX6-NEXT: s_subb_u32 s14, s17, s3
+; GFX6-NEXT: s_sub_u32 s15, s18, s2
+; GFX6-NEXT: s_subb_u32 s14, s14, 0
; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s15, s21, s18
+; GFX6-NEXT: s_cselect_b32 s15, s15, s18
; GFX6-NEXT: s_cselect_b32 s14, s14, s19
; GFX6-NEXT: s_or_b32 s12, s12, s13
; GFX6-NEXT: s_subb_u32 s9, s9, s16
@@ -9556,8 +9519,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT: s_sub_u32 s8, 0, s6
-; GFX6-NEXT: s_subb_u32 s9, 0, s7
+; GFX6-NEXT: s_sub_u32 s2, 0, s6
+; GFX6-NEXT: s_subb_u32 s3, 0, s7
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9566,70 +9529,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s12
-; GFX6-NEXT: v_readfirstlane_b32 s3, v2
-; GFX6-NEXT: s_mul_i32 s0, s9, s2
-; GFX6-NEXT: s_add_i32 s1, s3, s1
-; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s13, s8, s2
-; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT: s_mul_i32 s4, s2, s3
-; GFX6-NEXT: v_readfirstlane_b32 s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s9, s2, s8
+; GFX6-NEXT: v_readfirstlane_b32 s12, v2
+; GFX6-NEXT: s_mul_i32 s1, s3, s0
+; GFX6-NEXT: s_add_i32 s9, s12, s9
+; GFX6-NEXT: s_add_i32 s9, s9, s1
+; GFX6-NEXT: s_mul_i32 s1, s2, s0
+; GFX6-NEXT: v_mul_hi_u32 v2, v0, s9
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT: s_mul_i32 s12, s0, s9
+; GFX6-NEXT: v_readfirstlane_b32 s13, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT: s_add_u32 s4, s16, s4
-; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s9
+; GFX6-NEXT: s_add_u32 s12, s16, s12
+; GFX6-NEXT: s_addc_u32 s13, 0, s13
+; GFX6-NEXT: s_mul_i32 s1, s8, s1
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s13
-; GFX6-NEXT: s_addc_u32 s4, s5, s16
-; GFX6-NEXT: v_readfirstlane_b32 s5, v1
-; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s12, s3
-; GFX6-NEXT: s_add_u32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s4, 0, s5
-; GFX6-NEXT: s_add_u32 s5, s2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s12, s4
-; GFX6-NEXT: s_mul_i32 s2, s8, s4
-; GFX6-NEXT: v_readfirstlane_b32 s3, v0
-; GFX6-NEXT: s_add_i32 s2, s3, s2
-; GFX6-NEXT: s_mul_i32 s9, s9, s5
-; GFX6-NEXT: s_mul_i32 s3, s8, s5
-; GFX6-NEXT: s_add_i32 s2, s2, s9
-; GFX6-NEXT: v_mov_b32_e32 v2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: s_add_u32 s1, s12, s1
+; GFX6-NEXT: s_addc_u32 s1, s13, s16
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s9, s8, s9
+; GFX6-NEXT: s_add_u32 s1, s1, s9
+; GFX6-NEXT: s_addc_u32 s9, 0, s12
+; GFX6-NEXT: s_add_u32 s12, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: s_addc_u32 s4, s8, s9
+; GFX6-NEXT: s_mul_i32 s5, s2, s4
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: s_add_i32 s5, s8, s5
+; GFX6-NEXT: s_mul_i32 s3, s3, s12
+; GFX6-NEXT: s_mul_i32 s2, s2, s12
+; GFX6-NEXT: s_add_i32 s3, s5, s3
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v0, s3
; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s12, v2
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT: s_mul_i32 s9, s5, s2
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_mul_i32 s8, s12, s3
; GFX6-NEXT: v_readfirstlane_b32 s13, v2
-; GFX6-NEXT: s_add_u32 s9, s13, s9
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
-; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: v_readfirstlane_b32 s8, v3
-; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s12, s8
-; GFX6-NEXT: v_readfirstlane_b32 s8, v1
-; GFX6-NEXT: s_addc_u32 s8, s8, 0
+; GFX6-NEXT: s_add_u32 s8, s13, s8
+; GFX6-NEXT: v_readfirstlane_b32 s9, v0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
-; GFX6-NEXT: s_add_u32 s2, s3, s2
-; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s12, s5, s2
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s13, s4, s8
+; GFX6-NEXT: s_addc_u32 s9, 0, s9
+; GFX6-NEXT: v_readfirstlane_b32 s5, v3
+; GFX6-NEXT: s_add_u32 s2, s8, s2
+; GFX6-NEXT: s_addc_u32 s2, s9, s5
+; GFX6-NEXT: v_readfirstlane_b32 s5, v1
+; GFX6-NEXT: s_addc_u32 s5, s5, 0
+; GFX6-NEXT: s_mul_i32 s3, s4, s3
+; GFX6-NEXT: s_add_u32 s2, s2, s3
+; GFX6-NEXT: s_addc_u32 s3, 0, s5
+; GFX6-NEXT: s_add_u32 s12, s12, s2
+; GFX6-NEXT: s_addc_u32 s13, s4, s3
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
@@ -9667,11 +9626,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s13, s10, s11
; GFX6-NEXT: s_subb_u32 s17, s12, s7
; GFX6-NEXT: s_sub_u32 s18, s8, s6
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s12, s13
; GFX6-NEXT: s_subb_u32 s19, s17, 0
; GFX6-NEXT: s_cmp_ge_u32 s19, s7
; GFX6-NEXT: s_cselect_b32 s20, -1, 0
@@ -9680,13 +9637,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s19, s7
; GFX6-NEXT: s_cselect_b32 s20, s21, s20
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s17, s7
-; GFX6-NEXT: s_sub_u32 s21, s18, s6
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s17, 0
+; GFX6-NEXT: s_subb_u32 s12, s17, s7
+; GFX6-NEXT: s_sub_u32 s13, s18, s6
+; GFX6-NEXT: s_subb_u32 s12, s12, 0
; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s13, s21, s18
+; GFX6-NEXT: s_cselect_b32 s13, s13, s18
; GFX6-NEXT: s_cselect_b32 s12, s12, s19
; GFX6-NEXT: s_or_b32 s10, s10, s11
; GFX6-NEXT: s_subb_u32 s9, s9, s16
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de173dc8c6..8d05317162e9c 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_add_u32 s4, s4, s6
-; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT: s_or_b32 s6, s12, s13
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_sub_u32 s4, s4, s6
-; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT: s_or_b32 s6, s12, s13
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index dbdea8e3c533d..71af21a11c2ce 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s7, s6, s6
-; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX7-NEXT: s_or_b32 s4, s4, s5
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -88,15 +86,13 @@ bb:
define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-LABEL: s_add_co_br_user:
; GFX7: ; %bb.0: ; %bb
-; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_add_u32 s0, s2, s2
-; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_addc_u32 s0, s2, 0
+; GFX7-NEXT: s_add_u32 s1, s0, s0
+; GFX7-NEXT: s_addc_u32 s0, s0, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX7-NEXT: s_cbranch_vccnz .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 71f5a94a7f245..74a6d7fe39362 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s8, s1, 31
; GCN-NEXT: s_add_u32 s0, s0, s8
@@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GCN-NEXT: s_sub_u32 s12, 0, s10
-; GCN-NEXT: s_subb_u32 s13, 0, s11
+; GCN-NEXT: s_sub_u32 s0, 0, s10
+; GCN-NEXT: s_subb_u32 s1, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s1, s12, s14
-; GCN-NEXT: v_readfirstlane_b32 s17, v2
-; GCN-NEXT: s_mul_i32 s15, s13, s0
-; GCN-NEXT: s_mul_i32 s16, s12, s0
-; GCN-NEXT: s_add_i32 s1, s17, s1
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s16
-; GCN-NEXT: s_add_i32 s1, s1, s15
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s16
-; GCN-NEXT: v_readfirstlane_b32 s15, v3
-; GCN-NEXT: s_mul_i32 s17, s0, s1
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT: s_add_u32 s15, s15, s17
-; GCN-NEXT: v_readfirstlane_b32 s17, v0
-; GCN-NEXT: s_addc_u32 s17, 0, s17
-; GCN-NEXT: s_mul_i32 s16, s14, s16
-; GCN-NEXT: v_readfirstlane_b32 s18, v4
-; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_addc_u32 s15, s17, s18
-; GCN-NEXT: v_readfirstlane_b32 s16, v1
-; GCN-NEXT: s_addc_u32 s16, s16, 0
-; GCN-NEXT: s_mul_i32 s1, s14, s1
-; GCN-NEXT: s_add_u32 s1, s15, s1
-; GCN-NEXT: s_addc_u32 s15, 0, s16
-; GCN-NEXT: s_add_u32 s16, s0, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s14, s14, s15
-; GCN-NEXT: s_mul_i32 s0, s12, s14
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s0, s1, s0
-; GCN-NEXT: s_mul_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s1, s12, s16
-; GCN-NEXT: s_add_i32 s0, s0, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mul_hi_u32 v3, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s16, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s14, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s16, v0
-; GCN-NEXT: s_mul_i32 s13, s16, s0
-; GCN-NEXT: v_readfirstlane_b32 s17, v2
-; GCN-NEXT: s_add_u32 s13, s17, s13
-; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s1, s14, s1
-; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s12, v3
-; GCN-NEXT: s_add_u32 s1, s13, s1
-; GCN-NEXT: s_addc_u32 s1, s15, s12
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v0
; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s0, s14, s0
-; GCN-NEXT: s_add_u32 s0, s1, s0
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s15, s16, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s14, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s13, s0, s12
+; GCN-NEXT: v_readfirstlane_b32 s16, v2
+; GCN-NEXT: s_mul_i32 s14, s1, s2
+; GCN-NEXT: s_mul_i32 s15, s0, s2
+; GCN-NEXT: s_add_i32 s13, s16, s13
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s15
+; GCN-NEXT: s_add_i32 s13, s13, s14
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s13
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s15
+; GCN-NEXT: v_readfirstlane_b32 s14, v3
+; GCN-NEXT: s_mul_i32 s16, s2, s13
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s13
+; GCN-NEXT: s_add_u32 s14, s14, s16
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s15, s12, s15
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: v_readfirstlane_b32 s17, v4
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s14, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: s_addc_u32 s15, s15, 0
+; GCN-NEXT: s_mul_i32 s13, s12, s13
+; GCN-NEXT: s_add_u32 s13, s14, s13
+; GCN-NEXT: s_addc_u32 s14, 0, s15
+; GCN-NEXT: s_add_u32 s13, s2, s13
+; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT: s_addc_u32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s14, s0, s12
+; GCN-NEXT: s_mul_i32 s1, s1, s13
+; GCN-NEXT: v_readfirstlane_b32 s15, v0
+; GCN-NEXT: s_add_i32 s14, s15, s14
+; GCN-NEXT: s_mul_i32 s0, s0, s13
+; GCN-NEXT: s_add_i32 s1, s14, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s13, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s13, v0
+; GCN-NEXT: s_mul_i32 s15, s13, s1
+; GCN-NEXT: v_readfirstlane_b32 s17, v2
+; GCN-NEXT: s_add_u32 s15, s17, s15
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s0, s12, s0
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: v_readfirstlane_b32 s14, v3
+; GCN-NEXT: s_add_u32 s0, s15, s0
+; GCN-NEXT: s_addc_u32 s0, s16, s14
+; GCN-NEXT: v_readfirstlane_b32 s14, v1
+; GCN-NEXT: s_addc_u32 s14, s14, 0
+; GCN-NEXT: s_mul_i32 s1, s12, s1
+; GCN-NEXT: s_add_u32 s0, s0, s1
+; GCN-NEXT: s_addc_u32 s1, 0, s14
+; GCN-NEXT: s_add_u32 s14, s13, s0
+; GCN-NEXT: s_addc_u32 s15, s12, s1
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
; GCN-NEXT: s_mov_b32 s13, s12
; GCN-NEXT: s_addc_u32 s1, s7, s12
; GCN-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s15
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
-; GCN-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mul_hi_u32 v3, s6, v2
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_mul_hi_u32 v1, s7, v2
-; GCN-NEXT: s_mul_i32 s1, s6, s14
+; GCN-NEXT: s_mul_i32 s1, s6, s15
; GCN-NEXT: v_readfirstlane_b32 s16, v3
; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
; GCN-NEXT: s_add_u32 s1, s16, s1
; GCN-NEXT: s_addc_u32 s4, 0, s4
-; GCN-NEXT: s_mul_i32 s15, s7, s15
+; GCN-NEXT: s_mul_i32 s14, s7, s14
; GCN-NEXT: v_readfirstlane_b32 s16, v1
-; GCN-NEXT: s_add_u32 s1, s1, s15
+; GCN-NEXT: s_add_u32 s1, s1, s14
; GCN-NEXT: s_addc_u32 s1, s4, s16
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
-; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s16, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: s_mul_i32 s14, s7, s15
+; GCN-NEXT: s_add_u32 s14, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s17, 0, s4
+; GCN-NEXT: s_addc_u32 s15, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s17
+; GCN-NEXT: s_mul_i32 s4, s10, s15
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s16
-; GCN-NEXT: s_add_i32 s18, s4, s5
-; GCN-NEXT: s_sub_i32 s14, s7, s18
-; GCN-NEXT: s_mul_i32 s4, s10, s16
+; GCN-NEXT: s_mul_i32 s5, s11, s14
+; GCN-NEXT: s_add_i32 s16, s4, s5
+; GCN-NEXT: s_sub_i32 s17, s7, s16
+; GCN-NEXT: s_mul_i32 s4, s10, s14
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s15, s4, s5
-; GCN-NEXT: s_subb_u32 s19, s14, s11
-; GCN-NEXT: s_sub_u32 s20, s6, s10
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_or_b32 s14, s14, s15
-; GCN-NEXT: s_subb_u32 s14, s19, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s20, s10
+; GCN-NEXT: s_subb_u32 s17, s17, s11
+; GCN-NEXT: s_sub_u32 s18, s6, s10
+; GCN-NEXT: s_subb_u32 s17, s17, 0
+; GCN-NEXT: s_cmp_ge_u32 s17, s11
; GCN-NEXT: s_cselect_b32 s19, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s14, s19, s15
-; GCN-NEXT: s_add_u32 s15, s16, 1
-; GCN-NEXT: s_addc_u32 s19, s17, 0
-; GCN-NEXT: s_add_u32 s20, s16, 2
-; GCN-NEXT: s_addc_u32 s21, s17, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s14, s20, s15
-; GCN-NEXT: s_cselect_b32 s15, s21, s19
+; GCN-NEXT: s_cmp_ge_u32 s18, s10
+; GCN-NEXT: s_cselect_b32 s18, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s17, s11
+; GCN-NEXT: s_cselect_b32 s17, s18, s19
+; GCN-NEXT: s_add_u32 s18, s14, 1
+; GCN-NEXT: s_addc_u32 s19, s15, 0
+; GCN-NEXT: s_add_u32 s20, s14, 2
+; GCN-NEXT: s_addc_u32 s21, s15, 0
+; GCN-NEXT: s_cmp_lg_u32 s17, 0
+; GCN-NEXT: s_cselect_b32 s17, s20, s18
+; GCN-NEXT: s_cselect_b32 s18, s21, s19
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s18
+; GCN-NEXT: s_subb_u32 s4, s7, s16
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s10
@@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_cmp_eq_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s4, s6, s5
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s15, s17
-; GCN-NEXT: s_cselect_b32 s4, s14, s16
+; GCN-NEXT: s_cselect_b32 s5, s18, s15
+; GCN-NEXT: s_cselect_b32 s4, s17, s14
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
; GCN-NEXT: s_subb_u32 s5, s5, s7
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT: s_or_b32 s10, s10, s11
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_sub_u32 s16, s16, s20
; GCN-IR-NEXT: s_subb_u32 s17, s17, s21
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT: s_or_b32 s20, s20, s21
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s2, 0, s6
-; GCN-NEXT: s_subb_u32 s10, 0, s7
-; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_subb_u32 s8, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_mul_i32 s9, s2, s11
-; GCN-NEXT: v_readfirstlane_b32 s14, v2
-; GCN-NEXT: s_mul_i32 s12, s10, s8
-; GCN-NEXT: s_mul_i32 s13, s2, s8
-; GCN-NEXT: s_add_i32 s9, s14, s9
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
-; GCN-NEXT: s_add_i32 s9, s9, s12
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v3
-; GCN-NEXT: s_mul_i32 s15, s8, s9
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s9
-; GCN-NEXT: s_add_u32 s12, s12, s15
-; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s14, v4
-; GCN-NEXT: s_add_u32 s12, s12, s13
-; GCN-NEXT: s_addc_u32 s12, s15, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s9, s11, s9
-; GCN-NEXT: s_add_u32 s9, s12, s9
-; GCN-NEXT: s_addc_u32 s12, 0, s13
-; GCN-NEXT: s_add_u32 s13, s8, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-NEXT: s_mul_i32 s10, s2, s9
+; GCN-NEXT: v_readfirstlane_b32 s13, v2
+; GCN-NEXT: s_mul_i32 s11, s8, s3
+; GCN-NEXT: s_mul_i32 s12, s2, s3
+; GCN-NEXT: s_add_i32 s10, s13, s10
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s12
+; GCN-NEXT: s_add_i32 s10, s10, s11
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s10
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v3
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s10
+; GCN-NEXT: s_mul_i32 s14, s3, s10
+; GCN-NEXT: s_add_u32 s11, s11, s14
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s12, s9, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s13, v4
+; GCN-NEXT: s_add_u32 s11, s11, s12
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: s_addc_u32 s11, s14, s13
+; GCN-NEXT: s_addc_u32 s12, s15, 0
+; GCN-NEXT: s_mul_i32 s10, s9, s10
+; GCN-NEXT: s_add_u32 s10, s11, s10
+; GCN-NEXT: s_addc_u32 s11, 0, s12
+; GCN-NEXT: s_add_u32 s10, s3, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s11, s11, s12
-; GCN-NEXT: s_mul_i32 s8, s2, s11
-; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s8, s9, s8
-; GCN-NEXT: s_mul_i32 s10, s10, s13
-; GCN-NEXT: s_mul_i32 s2, s2, s13
-; GCN-NEXT: s_add_i32 s8, s8, s10
+; GCN-NEXT: s_addc_u32 s9, s9, s11
+; GCN-NEXT: s_mul_i32 s11, s2, s9
+; GCN-NEXT: s_mul_i32 s8, s8, s10
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_add_i32 s11, s12, s11
+; GCN-NEXT: s_mul_i32 s2, s2, s10
+; GCN-NEXT: s_add_i32 s8, s11, s8
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mul_hi_u32 v3, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s13, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s11, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s13, v0
-; GCN-NEXT: s_mul_i32 s10, s13, s8
+; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_mul_i32 s12, s10, s8
; GCN-NEXT: v_readfirstlane_b32 s14, v2
-; GCN-NEXT: s_add_u32 s10, s14, s10
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s2, s11, s2
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: v_readfirstlane_b32 s9, v3
-; GCN-NEXT: s_add_u32 s2, s10, s2
-; GCN-NEXT: s_addc_u32 s2, s12, s9
-; GCN-NEXT: v_readfirstlane_b32 s9, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_mul_i32 s8, s11, s8
+; GCN-NEXT: s_add_u32 s12, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s13, v0
+; GCN-NEXT: s_mul_i32 s2, s9, s2
+; GCN-NEXT: s_addc_u32 s13, 0, s13
+; GCN-NEXT: v_readfirstlane_b32 s11, v3
+; GCN-NEXT: s_add_u32 s2, s12, s2
+; GCN-NEXT: s_addc_u32 s2, s13, s11
+; GCN-NEXT: v_readfirstlane_b32 s11, v1
+; GCN-NEXT: s_addc_u32 s11, s11, 0
+; GCN-NEXT: s_mul_i32 s8, s9, s8
; GCN-NEXT: s_add_u32 s2, s2, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s9
-; GCN-NEXT: s_add_u32 s2, s13, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s8, s11, s10
+; GCN-NEXT: s_addc_u32 s8, 0, s11
+; GCN-NEXT: s_add_u32 s2, s10, s2
+; GCN-NEXT: s_addc_u32 s8, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
; GCN-NEXT: s_mul_i32 s8, s8, 24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s12, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: s_addc_u32 s10, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s12
+; GCN-NEXT: s_mul_i32 s8, s7, s10
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s13, s9, s8
-; GCN-NEXT: s_sub_i32 s10, 0, s13
-; GCN-NEXT: s_mul_i32 s8, s6, s12
-; GCN-NEXT: s_sub_u32 s14, 24, s8
+; GCN-NEXT: s_add_i32 s11, s9, s8
+; GCN-NEXT: s_sub_i32 s12, 0, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s10
+; GCN-NEXT: s_sub_u32 s13, 24, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s11, s8, s9
-; GCN-NEXT: s_subb_u32 s15, s10, s7
-; GCN-NEXT: s_sub_u32 s16, s14, s6
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s15, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s6
+; GCN-NEXT: s_subb_u32 s12, s12, s7
+; GCN-NEXT: s_sub_u32 s14, s13, s6
+; GCN-NEXT: s_subb_u32 s12, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s12, s7
; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s10, s15, s11
-; GCN-NEXT: s_add_u32 s11, s12, 1
+; GCN-NEXT: s_cmp_ge_u32 s14, s6
+; GCN-NEXT: s_cselect_b32 s14, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s12, s7
+; GCN-NEXT: s_cselect_b32 s12, s14, s15
+; GCN-NEXT: s_add_u32 s14, s10, 1
; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_add_u32 s16, s12, 2
+; GCN-NEXT: s_add_u32 s16, s10, 2
; GCN-NEXT: s_addc_u32 s17, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_cselect_b32 s10, s16, s11
-; GCN-NEXT: s_cselect_b32 s11, s17, s15
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_cselect_b32 s12, s16, s14
+; GCN-NEXT: s_cselect_b32 s14, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, 0, s13
+; GCN-NEXT: s_subb_u32 s8, 0, s11
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s6
+; GCN-NEXT: s_cmp_ge_u32 s13, s6
; GCN-NEXT: s_cselect_b32 s6, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s6, s6, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s11, 0
-; GCN-NEXT: s_cselect_b32 s6, s10, s12
+; GCN-NEXT: s_cselect_b32 s7, s14, 0
+; GCN-NEXT: s_cselect_b32 s6, s12, s10
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT: s_or_b32 s18, s18, s19
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ea9bb0417dfa4..862e2dd2de051 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT: s_sub_u32 s10, 0, s8
-; GCN-NEXT: s_subb_u32 s11, 0, s9
+; GCN-NEXT: s_sub_u32 s0, 0, s8
+; GCN-NEXT: s_subb_u32 s1, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s1, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_mul_i32 s13, s11, s0
-; GCN-NEXT: s_mul_i32 s14, s10, s0
-; GCN-NEXT: s_add_i32 s1, s15, s1
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT: s_add_i32 s1, s1, s13
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v3
-; GCN-NEXT: s_mul_i32 s15, s0, s1
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT: s_add_u32 s13, s13, s15
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s11, s0, s10
+; GCN-NEXT: v_readfirstlane_b32 s14, v2
+; GCN-NEXT: s_mul_i32 s12, s1, s2
+; GCN-NEXT: s_mul_i32 s13, s0, s2
+; GCN-NEXT: s_add_i32 s11, s14, s11
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT: s_add_i32 s11, s11, s12
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_mul_i32 s15, s2, s11
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT: s_add_u32 s12, s12, s15
; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s14, s12, s14
+; GCN-NEXT: s_mul_i32 s13, s10, s13
; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s16, v4
-; GCN-NEXT: s_add_u32 s13, s13, s14
-; GCN-NEXT: s_addc_u32 s13, s15, s16
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_add_u32 s1, s13, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s14
-; GCN-NEXT: s_add_u32 s14, s0, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s12, s12, s13
-; GCN-NEXT: s_mul_i32 s0, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s0, s1, s0
-; GCN-NEXT: s_mul_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s1, s10, s14
-; GCN-NEXT: s_add_i32 s0, s0, s11
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT: s_mul_i32 s11, s14, s0
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_add_u32 s11, s15, s11
+; GCN-NEXT: v_readfirstlane_b32 s14, v4
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s12, s15, s14
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s11, s10, s11
+; GCN-NEXT: s_add_u32 s11, s12, s11
+; GCN-NEXT: s_addc_u32 s12, 0, s13
+; GCN-NEXT: s_add_u32 s11, s2, s11
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT: s_addc_u32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s12, s0, s10
+; GCN-NEXT: s_mul_i32 s1, s1, s11
; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_add_u32 s1, s11, s1
-; GCN-NEXT: s_addc_u32 s1, s13, s10
-; GCN-NEXT: v_readfirstlane_b32 s10, v1
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s0, s12, s0
-; GCN-NEXT: s_add_u32 s0, s1, s0
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s11, s14, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s1, s12, s10
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s0, s0, s11
+; GCN-NEXT: s_add_i32 s1, s12, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT: s_mul_i32 s13, s11, s1
+; GCN-NEXT: v_readfirstlane_b32 s15, v2
+; GCN-NEXT: s_add_u32 s13, s15, s13
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s0, s10, s0
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_add_u32 s0, s13, s0
+; GCN-NEXT: s_addc_u32 s0, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v1
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s1, s10, s1
+; GCN-NEXT: s_add_u32 s0, s0, s1
+; GCN-NEXT: s_addc_u32 s1, 0, s12
+; GCN-NEXT: s_add_u32 s11, s11, s0
+; GCN-NEXT: s_addc_u32 s1, s10, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
; GCN-NEXT: v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
; GCN-NEXT: s_subb_u32 s13, s10, s9
; GCN-NEXT: s_sub_u32 s14, s6, s8
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
; GCN-NEXT: s_subb_u32 s15, s13, 0
; GCN-NEXT: s_cmp_ge_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_cmp_eq_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, s17, s16
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_subb_u32 s10, s13, s9
+; GCN-NEXT: s_sub_u32 s11, s14, s8
+; GCN-NEXT: s_subb_u32 s10, s10, 0
; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s11, s11, s14
; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_cmp_lg_u32 s5, 0
; GCN-NEXT: s_cselect_b32 s4, s10, s4
; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT: s_or_b32 s18, s18, s19
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GCN-NEXT: s_sub_u32 s10, 0, s4
-; GCN-NEXT: s_subb_u32 s11, 0, s5
+; GCN-NEXT: s_sub_u32 s8, 0, s4
+; GCN-NEXT: s_subb_u32 s9, 0, s5
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_mul_i32 s9, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_mul_i32 s13, s11, s8
-; GCN-NEXT: s_mul_i32 s14, s10, s8
-; GCN-NEXT: s_add_i32 s9, s15, s9
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT: s_add_i32 s9, s9, s13
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v3
-; GCN-NEXT: s_mul_i32 s15, s8, s9
-; GCN-NEXT: s_add_u32 s13, s13, s15
-; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: v_mul_hi_u32 v0, v1, s9
-; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: s_mul_i32 s14, s12, s14
-; GCN-NEXT: v_readfirstlane_b32 s16, v4
-; GCN-NEXT: s_add_u32 s13, s13, s14
-; GCN-NEXT: s_addc_u32 s13, s15, s16
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v0
+; GCN-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s11, s8, s10
+; GCN-NEXT: v_readfirstlane_b32 s14, v2
+; GCN-NEXT: s_mul_i32 s12, s9, s2
+; GCN-NEXT: s_mul_i32 s13, s8, s2
+; GCN-NEXT: s_add_i32 s11, s14, s11
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT: s_add_i32 s11, s11, s12
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_mul_i32 s14, s2, s11
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT: s_add_u32 s12, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s9, s12, s9
-; GCN-NEXT: s_add_u32 s9, s13, s9
-; GCN-NEXT: s_addc_u32 s13, 0, s14
-; GCN-NEXT: s_add_u32 s14, s8, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s12, s12, s13
-; GCN-NEXT: s_mul_i32 s8, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s8, s9, s8
-; GCN-NEXT: s_mul_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s9, s10, s14
-; GCN-NEXT: s_add_i32 s8, s8, s11
-; GCN-NEXT: v_mov_b32_e32 v2, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT: s_mul_i32 s11, s14, s8
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_add_u32 s11, s15, s11
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: s_mul_i32 s13, s10, s13
+; GCN-NEXT: v_readfirstlane_b32 s15, v4
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s12, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s11, s10, s11
+; GCN-NEXT: s_add_u32 s11, s12, s11
+; GCN-NEXT: s_addc_u32 s12, 0, s13
+; GCN-NEXT: s_add_u32 s11, s2, s11
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_addc_u32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s12, s8, s10
+; GCN-NEXT: s_mul_i32 s9, s9, s11
; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s9, s12, s9
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_add_u32 s9, s11, s9
-; GCN-NEXT: s_addc_u32 s9, s13, s10
-; GCN-NEXT: v_readfirstlane_b32 s10, v1
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s8, s12, s8
-; GCN-NEXT: s_add_u32 s8, s9, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s11, s14, s8
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s10, s12, s10
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s8, s8, s11
+; GCN-NEXT: s_add_i32 s9, s12, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NEXT: v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT: s_mul_i32 s13, s11, s9
+; GCN-NEXT: v_readfirstlane_b32 s15, v2
+; GCN-NEXT: s_add_u32 s13, s15, s13
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s8, s10, s8
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_add_u32 s8, s13, s8
+; GCN-NEXT: s_addc_u32 s8, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v1
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s9, s10, s9
+; GCN-NEXT: s_add_u32 s8, s8, s9
+; GCN-NEXT: s_addc_u32 s9, 0, s12
+; GCN-NEXT: s_add_u32 s11, s11, s8
+; GCN-NEXT: s_addc_u32 s10, s10, s9
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
; GCN-NEXT: s_mov_b32 s9, s8
@@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
; GCN-NEXT: s_addc_u32 s11, 0, s12
; GCN-NEXT: s_mul_i32 s11, s4, s11
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
@@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s13, s10, s11
; GCN-NEXT: s_subb_u32 s15, s12, s5
; GCN-NEXT: s_sub_u32 s16, s6, s4
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s17, s12, s13
; GCN-NEXT: s_subb_u32 s17, s15, 0
; GCN-NEXT: s_cmp_ge_u32 s17, s5
; GCN-NEXT: s_cselect_b32 s18, -1, 0
@@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_cmp_eq_u32 s17, s5
; GCN-NEXT: s_cselect_b32 s18, s19, s18
; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s15, s15, s5
-; GCN-NEXT: s_sub_u32 s19, s16, s4
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s12, s15, 0
+; GCN-NEXT: s_subb_u32 s12, s15, s5
+; GCN-NEXT: s_sub_u32 s13, s16, s4
+; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_cselect_b32 s13, s19, s16
+; GCN-NEXT: s_cselect_b32 s13, s13, s16
; GCN-NEXT: s_cselect_b32 s12, s12, s17
; GCN-NEXT: s_or_b32 s10, s10, s11
; GCN-NEXT: s_subb_u32 s7, s7, s14
@@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT: s_or_b32 s10, s10, s11
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_sub_u32 s14, s14, s20
; GCN-IR-NEXT: s_subb_u32 s15, s15, s21
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
-; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT: s_or_b32 s20, s20, s21
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5
; GCN-NEXT: s_sub_u32 s2, 0, s4
-; GCN-NEXT: s_subb_u32 s8, 0, s5
-; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_subb_u32 s6, 0, s5
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT: v_readfirstlane_b32 s9, v1
-; GCN-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NEXT: s_mul_i32 s7, s2, s9
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_mul_i32 s10, s8, s6
-; GCN-NEXT: s_mul_i32 s11, s2, s6
-; GCN-NEXT: s_add_i32 s7, s12, s7
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT: s_add_i32 s7, s7, s10
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s7
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_mul_i32 s13, s6, s7
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s7
-; GCN-NEXT: s_add_u32 s10, s10, s13
-; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s11, s9, s11
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v4
-; GCN-NEXT: s_add_u32 s10, s10, s11
-; GCN-NEXT: s_addc_u32 s10, s13, s12
-; GCN-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NEXT: s_addc_u32 s11, s11, 0
-; GCN-NEXT: s_mul_i32 s7, s9, s7
-; GCN-NEXT: s_add_u32 s7, s10, s7
-; GCN-NEXT: s_addc_u32 s10, 0, s11
-; GCN-NEXT: s_add_u32 s11, s6, s7
-; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-NEXT: s_mul_i32 s8, s2, s7
+; GCN-NEXT: v_readfirstlane_b32 s11, v2
+; GCN-NEXT: s_mul_i32 s9, s6, s3
+; GCN-NEXT: s_mul_i32 s10, s2, s3
+; GCN-NEXT: s_add_i32 s8, s11, s8
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT: s_add_i32 s8, s8, s9
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT: s_mul_i32 s12, s3, s8
+; GCN-NEXT: s_add_u32 s9, s9, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s10, s7, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v4
+; GCN-NEXT: s_add_u32 s9, s9, s10
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s9, s12, s11
+; GCN-NEXT: s_addc_u32 s10, s13, 0
+; GCN-NEXT: s_mul_i32 s8, s7, s8
+; GCN-NEXT: s_add_u32 s8, s9, s8
+; GCN-NEXT: s_addc_u32 s9, 0, s10
+; GCN-NEXT: s_add_u32 s8, s3, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_addc_u32 s9, s9, s10
-; GCN-NEXT: s_mul_i32 s6, s2, s9
-; GCN-NEXT: v_readfirstlane_b32 s7, v0
-; GCN-NEXT: s_add_i32 s6, s7, s6
-; GCN-NEXT: s_mul_i32 s8, s8, s11
-; GCN-NEXT: s_mul_i32 s2, s2, s11
-; GCN-NEXT: s_add_i32 s6, s6, s8
+; GCN-NEXT: s_addc_u32 s7, s7, s9
+; GCN-NEXT: s_mul_i32 s9, s2, s7
+; GCN-NEXT: s_mul_i32 s6, s6, s8
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_add_i32 s9, s10, s9
+; GCN-NEXT: s_mul_i32 s2, s2, s8
+; GCN-NEXT: s_add_i32 s6, s9, s6
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: s_mul_i32 s8, s11, s6
+; GCN-NEXT: v_mul_hi_u32 v3, s7, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_mul_i32 s10, s8, s6
; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_add_u32 s8, s12, s8
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s2, s9, s2
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: v_readfirstlane_b32 s7, v3
-; GCN-NEXT: s_add_u32 s2, s8, s2
-; GCN-NEXT: s_addc_u32 s2, s10, s7
-; GCN-NEXT: v_readfirstlane_b32 s7, v1
-; GCN-NEXT: s_addc_u32 s7, s7, 0
-; GCN-NEXT: s_mul_i32 s6, s9, s6
+; GCN-NEXT: s_add_u32 s10, s12, s10
+; GCN-NEXT: v_readfirstlane_b32 s11, v0
+; GCN-NEXT: s_mul_i32 s2, s7, s2
+; GCN-NEXT: s_addc_u32 s11, 0, s11
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_add_u32 s2, s10, s2
+; GCN-NEXT: s_addc_u32 s2, s11, s9
+; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mul_i32 s6, s7, s6
; GCN-NEXT: s_add_u32 s2, s2, s6
-; GCN-NEXT: s_addc_u32 s8, 0, s7
-; GCN-NEXT: s_add_u32 s2, s11, s2
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_addc_u32 s6, s9, s8
+; GCN-NEXT: s_addc_u32 s6, 0, s9
+; GCN-NEXT: s_add_u32 s2, s8, s2
+; GCN-NEXT: s_addc_u32 s6, s7, s6
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
; GCN-NEXT: s_mul_i32 s6, s6, 24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s7, v0
; GCN-NEXT: s_add_u32 s6, s8, s6
@@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: s_add_i32 s10, s8, s7
; GCN-NEXT: s_sub_i32 s8, 0, s10
; GCN-NEXT: s_sub_u32 s11, 24, s6
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s9, s6, s7
; GCN-NEXT: s_subb_u32 s12, s8, s5
; GCN-NEXT: s_sub_u32 s13, s11, s4
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
; GCN-NEXT: s_subb_u32 s14, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s14, s5
; GCN-NEXT: s_cselect_b32 s15, -1, 0
@@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_cmp_eq_u32 s14, s5
; GCN-NEXT: s_cselect_b32 s15, s16, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s5
-; GCN-NEXT: s_sub_u32 s16, s13, s4
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, s12, s5
+; GCN-NEXT: s_sub_u32 s9, s13, s4
+; GCN-NEXT: s_subb_u32 s8, s8, 0
; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s9, s9, s13
; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
; GCN-NEXT: s_subb_u32 s6, 0, s10
@@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
-; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT: s_or_b32 s9, s10, s11
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bdd22f25e91c8..b000fae124ede 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_add_u32 s2, s2, s8
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_addc_u32 s3, s3, s9
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, s3
@@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s4, s6
-; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT: s_or_b32 s6, s12, s13
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fd461ac80ea55..775483c040b7f 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_sub_u32 s12, s12, s16
; GCN-IR-NEXT: s_subb_u32 s13, s13, s17
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -786,12 +782,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-LABEL: s_test_udiv_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT: s_sub_u32 s6, 0, s2
-; GCN-NEXT: s_subb_u32 s8, 0, s3
+; GCN-NEXT: s_sub_u32 s4, 0, s2
+; GCN-NEXT: s_subb_u32 s5, 0, s3
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -800,118 +795,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NEXT: v_readfirstlane_b32 s7, v0
+; GCN-NEXT: s_mul_i32 s8, s4, s6
+; GCN-NEXT: v_readfirstlane_b32 s11, v2
+; GCN-NEXT: s_mul_i32 s9, s5, s7
+; GCN-NEXT: s_mul_i32 s10, s4, s7
+; GCN-NEXT: s_add_i32 s8, s11, s8
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT: s_add_i32 s8, s8, s9
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_mul_i32 s12, s7, s8
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT: s_add_u32 s9, s9, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s10, s6, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v4
+; GCN-NEXT: s_add_u32 s9, s9, s10
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s9, s12, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s8
+; GCN-NEXT: s_addc_u32 s10, s13, 0
+; GCN-NEXT: s_add_u32 s8, s9, s8
+; GCN-NEXT: s_addc_u32 s9, 0, s10
+; GCN-NEXT: s_add_u32 s8, s7, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT: s_addc_u32 s6, s6, s9
+; GCN-NEXT: s_mul_i32 s9, s4, s6
+; GCN-NEXT: s_mul_i32 s5, s5, s8
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_add_i32 s9, s10, s9
+; GCN-NEXT: s_mul_i32 s4, s4, s8
+; GCN-NEXT: s_add_i32 s5, s9, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_mul_i32 s10, s8, s5
+; GCN-NEXT: v_readfirstlane_b32 s12, v2
+; GCN-NEXT: s_add_u32 s10, s12, s10
+; GCN-NEXT: v_readfirstlane_b32 s11, v0
+; GCN-NEXT: s_mul_i32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s11, 0, s11
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_add_u32 s4, s10, s4
+; GCN-NEXT: s_addc_u32 s4, s11, s9
; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mul_i32 s5, s6, s5
+; GCN-NEXT: s_add_u32 s4, s4, s5
+; GCN-NEXT: s_addc_u32 s5, 0, s9
+; GCN-NEXT: s_add_u32 s4, s8, s4
+; GCN-NEXT: s_addc_u32 s5, s6, s5
+; GCN-NEXT: v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT: v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT: s_mul_i32 s5, s5, 24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-NEXT: s_mul_i32 s5, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_mul_i32 s10, s8, s4
-; GCN-NEXT: s_mul_i32 s11, s6, s4
-; GCN-NEXT: s_add_i32 s5, s12, s5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT: s_add_i32 s5, s5, s10
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT: s_mul_i32 s13, s4, s5
-; GCN-NEXT: s_add_u32 s10, s10, s13
-; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s11, s9, s11
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v4
-; GCN-NEXT: s_add_u32 s10, s10, s11
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s10, s13, s12
-; GCN-NEXT: s_addc_u32 s11, s14, 0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_add_u32 s5, s10, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s11
-; GCN-NEXT: s_add_u32 s11, s4, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s9, s9, s10
-; GCN-NEXT: s_mul_i32 s4, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s8, s8, s11
-; GCN-NEXT: s_mul_i32 s5, s6, s11
-; GCN-NEXT: s_add_i32 s4, s4, s8
-; GCN-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: s_mul_i32 s8, s11, s4
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_add_u32 s8, s12, s8
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: v_readfirstlane_b32 s6, v3
; GCN-NEXT: s_add_u32 s5, s8, s5
-; GCN-NEXT: s_addc_u32 s5, s10, s6
-; GCN-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-NEXT: s_addc_u32 s6, s6, 0
-; GCN-NEXT: s_mul_i32 s4, s9, s4
-; GCN-NEXT: s_add_u32 s4, s5, s4
-; GCN-NEXT: s_addc_u32 s6, 0, s6
-; GCN-NEXT: s_add_u32 s8, s11, s4
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s4, s9, s6
-; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT: s_mul_i32 s4, s4, 24
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_readfirstlane_b32 s8, v1
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s10, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s8, 0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s10
+; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s11, s1, s0
-; GCN-NEXT: s_sub_i32 s8, 0, s11
-; GCN-NEXT: s_mul_i32 s0, s2, s10
-; GCN-NEXT: s_sub_u32 s12, 24, s0
+; GCN-NEXT: s_add_i32 s9, s1, s0
+; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_mul_i32 s0, s2, s8
+; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s9, s0, s1
-; GCN-NEXT: s_subb_u32 s13, s8, s3
-; GCN-NEXT: s_sub_u32 s14, s12, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s2
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s12, s11, s2
+; GCN-NEXT: s_subb_u32 s10, s10, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s3
; GCN-NEXT: s_cselect_b32 s13, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s8, s13, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s10, s3
+; GCN-NEXT: s_cselect_b32 s10, s12, s13
+; GCN-NEXT: s_add_u32 s12, s8, 1
; GCN-NEXT: s_addc_u32 s13, 0, 0
-; GCN-NEXT: s_add_u32 s14, s10, 2
+; GCN-NEXT: s_add_u32 s14, s8, 2
; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s14, s9
-; GCN-NEXT: s_cselect_b32 s9, s15, s13
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_cselect_b32 s10, s14, s12
+; GCN-NEXT: s_cselect_b32 s12, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_subb_u32 s0, 0, s11
+; GCN-NEXT: s_subb_u32 s0, 0, s9
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s0, s2, s1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s9, 0
-; GCN-NEXT: s_cselect_b32 s1, s8, s10
+; GCN-NEXT: s_cselect_b32 s0, s12, 0
+; GCN-NEXT: s_cselect_b32 s1, s10, s8
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -937,8 +926,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -969,8 +956,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1307,8 +1292,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1336,8 +1319,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s2, s2, s8
; GCN-IR-NEXT: s_subb_u32 s3, s3, 0
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-IR-NEXT: s_or_b32 s12, s12, s13
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 137dc1fe42294..28e6627b87413 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT: s_sub_u32 s10, 0, s8
-; GCN-NEXT: s_subb_u32 s11, 0, s9
+; GCN-NEXT: s_sub_u32 s0, 0, s8
+; GCN-NEXT: s_subb_u32 s1, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s1, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_mul_i32 s13, s11, s0
-; GCN-NEXT: s_mul_i32 s14, s10, s0
-; GCN-NEXT: s_add_i32 s1, s15, s1
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT: s_add_i32 s1, s1, s13
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v3
-; GCN-NEXT: s_mul_i32 s15, s0, s1
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT: s_add_u32 s13, s13, s15
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s11, s0, s10
+; GCN-NEXT: v_readfirstlane_b32 s14, v2
+; GCN-NEXT: s_mul_i32 s12, s1, s2
+; GCN-NEXT: s_mul_i32 s13, s0, s2
+; GCN-NEXT: s_add_i32 s11, s14, s11
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT: s_add_i32 s11, s11, s12
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_mul_i32 s15, s2, s11
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT: s_add_u32 s12, s12, s15
; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s14, s12, s14
+; GCN-NEXT: s_mul_i32 s13, s10, s13
; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s16, v4
-; GCN-NEXT: s_add_u32 s13, s13, s14
-; GCN-NEXT: s_addc_u32 s13, s15, s16
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_add_u32 s1, s13, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s14
-; GCN-NEXT: s_add_u32 s14, s0, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s12, s12, s13
-; GCN-NEXT: s_mul_i32 s0, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s0, s1, s0
-; GCN-NEXT: s_mul_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s1, s10, s14
-; GCN-NEXT: s_add_i32 s0, s0, s11
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT: s_mul_i32 s11, s14, s0
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_add_u32 s11, s15, s11
+; GCN-NEXT: v_readfirstlane_b32 s14, v4
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s12, s15, s14
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s11, s10, s11
+; GCN-NEXT: s_add_u32 s11, s12, s11
+; GCN-NEXT: s_addc_u32 s12, 0, s13
+; GCN-NEXT: s_add_u32 s11, s2, s11
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT: s_addc_u32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s12, s0, s10
+; GCN-NEXT: s_mul_i32 s1, s1, s11
; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_add_u32 s1, s11, s1
-; GCN-NEXT: s_addc_u32 s1, s13, s10
-; GCN-NEXT: v_readfirstlane_b32 s10, v1
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s0, s12, s0
-; GCN-NEXT: s_add_u32 s0, s1, s0
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s11, s14, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s1, s12, s10
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s0, s0, s11
+; GCN-NEXT: s_add_i32 s1, s12, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT: s_mul_i32 s13, s11, s1
+; GCN-NEXT: v_readfirstlane_b32 s15, v2
+; GCN-NEXT: s_add_u32 s13, s15, s13
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s0, s10, s0
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_add_u32 s0, s13, s0
+; GCN-NEXT: s_addc_u32 s0, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v1
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s1, s10, s1
+; GCN-NEXT: s_add_u32 s0, s0, s1
+; GCN-NEXT: s_addc_u32 s1, 0, s12
+; GCN-NEXT: s_add_u32 s11, s11, s0
+; GCN-NEXT: s_addc_u32 s1, s10, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
; GCN-NEXT: v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
; GCN-NEXT: s_subb_u32 s13, s10, s9
; GCN-NEXT: s_sub_u32 s14, s6, s8
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
; GCN-NEXT: s_subb_u32 s15, s13, 0
; GCN-NEXT: s_cmp_ge_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_cmp_eq_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, s17, s16
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_subb_u32 s10, s13, s9
+; GCN-NEXT: s_sub_u32 s11, s14, s8
+; GCN-NEXT: s_subb_u32 s10, s10, 0
; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s11, s11, s14
; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_cmp_lg_u32 s5, 0
; GCN-NEXT: s_cselect_b32 s4, s10, s4
; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT: s_or_b32 s18, s18, s19
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-LABEL: s_test_urem_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT: s_sub_u32 s6, 0, s2
-; GCN-NEXT: s_subb_u32 s8, 0, s3
+; GCN-NEXT: s_sub_u32 s4, 0, s2
+; GCN-NEXT: s_subb_u32 s5, 0, s3
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NEXT: v_readfirstlane_b32 s7, v0
+; GCN-NEXT: s_mul_i32 s8, s4, s6
+; GCN-NEXT: v_readfirstlane_b32 s11, v2
+; GCN-NEXT: s_mul_i32 s9, s5, s7
+; GCN-NEXT: s_mul_i32 s10, s4, s7
+; GCN-NEXT: s_add_i32 s8, s11, s8
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT: s_add_i32 s8, s8, s9
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_mul_i32 s12, s7, s8
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT: s_add_u32 s9, s9, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s10, s6, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v4
+; GCN-NEXT: s_add_u32 s9, s9, s10
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s9, s12, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s8
+; GCN-NEXT: s_addc_u32 s10, s13, 0
+; GCN-NEXT: s_add_u32 s8, s9, s8
+; GCN-NEXT: s_addc_u32 s9, 0, s10
+; GCN-NEXT: s_add_u32 s8, s7, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT: s_addc_u32 s6, s6, s9
+; GCN-NEXT: s_mul_i32 s9, s4, s6
+; GCN-NEXT: s_mul_i32 s5, s5, s8
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_add_i32 s9, s10, s9
+; GCN-NEXT: s_mul_i32 s4, s4, s8
+; GCN-NEXT: s_add_i32 s5, s9, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_mul_i32 s10, s8, s5
+; GCN-NEXT: v_readfirstlane_b32 s12, v2
+; GCN-NEXT: s_add_u32 s10, s12, s10
+; GCN-NEXT: v_readfirstlane_b32 s11, v0
+; GCN-NEXT: s_mul_i32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s11, 0, s11
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_add_u32 s4, s10, s4
+; GCN-NEXT: s_addc_u32 s4, s11, s9
; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mul_i32 s5, s6, s5
+; GCN-NEXT: s_add_u32 s4, s4, s5
+; GCN-NEXT: s_addc_u32 s5, 0, s9
+; GCN-NEXT: s_add_u32 s4, s8, s4
+; GCN-NEXT: s_addc_u32 s5, s6, s5
+; GCN-NEXT: v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT: v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT: s_mul_i32 s5, s5, 24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-NEXT: s_mul_i32 s5, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_mul_i32 s10, s8, s4
-; GCN-NEXT: s_mul_i32 s11, s6, s4
-; GCN-NEXT: s_add_i32 s5, s12, s5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT: s_add_i32 s5, s5, s10
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT: s_mul_i32 s13, s4, s5
-; GCN-NEXT: s_add_u32 s10, s10, s13
-; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s11, s9, s11
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v4
-; GCN-NEXT: s_add_u32 s10, s10, s11
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s10, s13, s12
-; GCN-NEXT: s_addc_u32 s11, s14, 0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_add_u32 s5, s10, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s11
-; GCN-NEXT: s_add_u32 s11, s4, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s9, s9, s10
-; GCN-NEXT: s_mul_i32 s4, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s8, s8, s11
-; GCN-NEXT: s_mul_i32 s5, s6, s11
-; GCN-NEXT: s_add_i32 s4, s4, s8
-; GCN-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: s_mul_i32 s8, s11, s4
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_add_u32 s8, s12, s8
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: v_readfirstlane_b32 s6, v3
; GCN-NEXT: s_add_u32 s5, s8, s5
-; GCN-NEXT: s_addc_u32 s5, s10, s6
-; GCN-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-NEXT: s_addc_u32 s6, s6, 0
-; GCN-NEXT: s_mul_i32 s4, s9, s4
-; GCN-NEXT: s_add_u32 s4, s5, s4
-; GCN-NEXT: s_addc_u32 s6, 0, s6
-; GCN-NEXT: s_add_u32 s8, s11, s4
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s4, s9, s6
-; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT: s_mul_i32 s4, s4, 24
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_readfirstlane_b32 s8, v1
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s8, 0, s5
+; GCN-NEXT: s_addc_u32 s8, 0, s4
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
@@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s0, s2, s8
; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s8, s0, s1
; GCN-NEXT: s_subb_u32 s12, s9, s3
; GCN-NEXT: s_sub_u32 s13, s11, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
; GCN-NEXT: s_subb_u32 s14, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s14, s3
; GCN-NEXT: s_cselect_b32 s15, -1, 0
@@ -912,13 +893,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_cmp_eq_u32 s14, s3
; GCN-NEXT: s_cselect_b32 s15, s16, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s3
-; GCN-NEXT: s_sub_u32 s16, s13, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, s12, s3
+; GCN-NEXT: s_sub_u32 s9, s13, s2
+; GCN-NEXT: s_subb_u32 s8, s8, 0
; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s9, s9, s13
; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_subb_u32 s0, 0, s10
@@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_cselect_b32 s0, s8, s0
; GCN-NEXT: s_cselect_b32 s1, s9, s11
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s8, s8, s10
; GCN-IR-NEXT: s_subb_u32 s9, s9, 0
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-IR-NEXT: s_or_b32 s14, s14, s15
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index e8db6471b6a46..8a54ad301f48a 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_sub_u32 s2, s2, s8
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_subb_u32 s3, s3, s9
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, s3
@@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s4, s4, s6
-; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT: s_or_b32 s6, s12, s13
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
>From 8f79d82850d3a423649673beee024e7ce84e98ae Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 27 Oct 2025 11:09:27 -0500
Subject: [PATCH 02/13] Fix mnemonic
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 305c9c40ab726..91df365072521 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10689,7 +10689,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!optimizeSCC(Def, &CmpInstr, RI))
return false;
- // If s_or_32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of
+ // If s_or_b32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of
// a register pair) and the input is a 64-bit foldableSelect then transform:
//
// (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0 => (S_CSELECT_B64
>From a276310d79c6b6ac5cf8ad096c6ddd9ca3fe2276 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 31 Oct 2025 23:20:59 -0500
Subject: [PATCH 03/13] Fix arguments
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 91df365072521..e37dea66ae031 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10704,8 +10704,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
OrOpnd1.getReg() != OrOpnd2.getReg()) {
auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI);
auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI);
- if (Def1 == Def2 && foldableSelect(Def1))
- optimizeSCC(Def1, Def);
+ if (Def1 == Def2 && foldableSelect(*Def1))
+ optimizeSCC(Def1, Def, RI);
}
}
return true;
>From f261b26a4d99c26e4a7e737e4add771b65d7e931 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 31 Oct 2025 23:29:00 -0500
Subject: [PATCH 04/13] Fix comment
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e37dea66ae031..db6c58cbaabed 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10690,11 +10690,11 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
return false;
// If s_or_b32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of
- // a register pair) and the input is a 64-bit foldableSelect then transform:
+ // a register pair) and the inputs are the hi and lo-halves of a 64-bit
+ // foldableSelect then transform:
//
- // (s_or_b32 (S_CSELECT_B64 (non-zero imm), 0), 0 => (S_CSELECT_B64
- // (non-zero
- // imm), 0)
+ // (s_or_b32 [hi and lo (S_CSELECT_B64 (non-zero imm), 0)]) =>
+ // (S_CSELECT_B64 (non-zero imm), 0)
if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
MachineOperand OrOpnd1 = Def->getOperand(1);
>From b4f3449a2da8daee663e7f57cc1fc9aea95f2444 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Sat, 1 Nov 2025 13:01:56 -0500
Subject: [PATCH 05/13] Don't copy MachineOperand
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index db6c58cbaabed..27281b1ea1355 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10697,8 +10697,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// (S_CSELECT_B64 (non-zero imm), 0)
if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
- MachineOperand OrOpnd1 = Def->getOperand(1);
- MachineOperand OrOpnd2 = Def->getOperand(2);
+ const MachineOperand &OrOpnd1 = Def->getOperand(1);
+ const MachineOperand &OrOpnd2 = Def->getOperand(2);
if (OrOpnd1.isReg() && OrOpnd2.isReg() &&
OrOpnd1.getReg() != OrOpnd2.getReg()) {
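
For context on this change: MachineOperand was being copied by value just to
inspect it. A minimal standalone sketch (toy type, not the LLVM class) of the
difference between binding by value and by const reference:

    #include <cassert>

    // Toy stand-in for an operand type; counts copy-constructions.
    struct Operand {
      static inline int Copies = 0;
      Operand() = default;
      Operand(const Operand &) { ++Copies; }
    };

    int main() {
      Operand Op;
      Operand ByValue = Op;      // copy-constructed: Copies becomes 1
      const Operand &ByRef = Op; // binds directly: no copy
      (void)ByRef;
      assert(Operand::Copies == 1);
      return 0;
    }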
>From b3db724b5bbcea25ecaaea5cd3bb533d2907392f Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Sat, 1 Nov 2025 13:08:01 -0500
Subject: [PATCH 06/13] Make comment clear
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 27281b1ea1355..5b420533f6804 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10689,12 +10689,13 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!optimizeSCC(Def, &CmpInstr, RI))
return false;
- // If s_or_b32 result is unused (i.e. it is effectively a 64-bit s_cmp_lg of
- // a register pair) and the inputs are the hi and lo-halves of a 64-bit
- // foldableSelect then transform:
- //
- // (s_or_b32 [hi and lo (S_CSELECT_B64 (non-zero imm), 0)]) =>
- // (S_CSELECT_B64 (non-zero imm), 0)
+ // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
+ // s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
+ // 64-bit foldableSelect then transform:
+ // s_cselect_b64 sX, (non-zero imm), 0
+ // s_or_b32 sY, hi(sX), lo(sX)
+ // to:
+ // s_cselect_b64 sX, (non-zero imm), 0
if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
const MachineOperand &OrOpnd1 = Def->getOperand(1);
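
The comment above gives the sequence; the reason the fold is sound is that
OR-ing the lo and hi halves of a 64-bit value is exactly a 64-bit zero test,
and the select only ever produces a non-zero immediate or 0, so the SCC the
s_or_b32 would compute is the SCC the select's condition already holds. A
standalone sketch of the equivalence (illustration only, not part of the
patch):

    #include <cassert>
    #include <cstdint>

    // lo|hi of a 64-bit value is non-zero iff the value is non-zero, so the
    // SCC produced by the s_or_b32 matches the select's condition.
    static bool nonZeroViaHalves(uint64_t X) {
      uint32_t Lo = static_cast<uint32_t>(X);
      uint32_t Hi = static_cast<uint32_t>(X >> 32);
      return (Lo | Hi) != 0;
    }

    int main() {
      for (uint64_t X : {0ull, 1ull, 1ull << 32, ~0ull})
        assert(nonZeroViaHalves(X) == (X != 0));
      return 0;
    }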
>From 57d9eddbb9e0b595dbd57973b5c02d027a188581 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 3 Nov 2025 10:43:26 -0600
Subject: [PATCH 07/13] Ensure hi/lo halves are or-ed. Add MIR tests.
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 28 +++---
llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 90 +++++++++++++++++++
2 files changed, 107 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5b420533f6804..db00619b5cfeb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10691,22 +10691,28 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
// s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
- // 64-bit foldableSelect then transform:
- // s_cselect_b64 sX, (non-zero imm), 0
- // s_or_b32 sY, hi(sX), lo(sX)
- // to:
- // s_cselect_b64 sX, (non-zero imm), 0
+ // 64-bit foldableSelect then delete s_or_b32 in the sequence:
+ // sX = s_cselect_b64 (non-zero imm), 0
+ // sLo = copy sX.sub0
+ // sHi = copy sX.sub1
+ // sY = s_or_b32 sLo, sHi
if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
const MachineOperand &OrOpnd1 = Def->getOperand(1);
const MachineOperand &OrOpnd2 = Def->getOperand(2);
-
- if (OrOpnd1.isReg() && OrOpnd2.isReg() &&
- OrOpnd1.getReg() != OrOpnd2.getReg()) {
- auto *Def1 = getVRegSubRegDef(getRegSubRegPair(OrOpnd1), *MRI);
- auto *Def2 = getVRegSubRegDef(getRegSubRegPair(OrOpnd2), *MRI);
- if (Def1 == Def2 && foldableSelect(*Def1))
+ if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
+ MachineInstr *Def1 = MRI->getUniqueVRegDef(OrOpnd1.getReg());
+ MachineInstr *Def2 = MRI->getUniqueVRegDef(OrOpnd2.getReg());
+ if (Def1->getOpcode() == AMDGPU::COPY &&
+ Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+ Def2->getOperand(1).isReg() &&
+ Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+ Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+ Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg() &&
+ foldableSelect(
+ *MRI->getUniqueVRegDef(Def1->getOperand(1).getReg()))) {
optimizeSCC(Def1, Def, RI);
+ }
}
}
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index fba42c494343b..7538fab3f6069 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -2277,3 +2277,93 @@ body: |
S_ENDPGM 0
...
+
+---
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec
+ %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+ %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
+# Do not delete s_or_b32 since both operands are sub1.
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+ %40:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+ %41:sreg_32 = COPY %31.sub1:sreg_64_xexec
+ %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
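
The second test above is negative for a reason worth spelling out: with
S_CSELECT_B64 1, 0 the high half of the select is always zero, so sub1|sub1 is
identically 0 and does not reproduce the select's condition; only sub0|sub1
does. A standalone sketch (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      // The select produces either 1 or 0 (S_CSELECT_B64 1, 0).
      for (uint64_t Sel : {uint64_t(1), uint64_t(0)}) {
        uint32_t Lo = static_cast<uint32_t>(Sel);
        uint32_t Hi = static_cast<uint32_t>(Sel >> 32);
        assert(((Lo | Hi) != 0) == (Sel != 0)); // sub0|sub1: valid zero test
        assert((Hi | Hi) == 0);                 // sub1|sub1: always 0 here
      }
      return 0;
    }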
>From 919962bc0dca3e4142f4a7fdcc8a551d61878e7d Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Tue, 4 Nov 2025 23:39:44 -0600
Subject: [PATCH 08/13] Add undef testcase
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 41 +++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index 7538fab3f6069..0fe665551b9b6 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -2367,3 +2367,44 @@ body: |
S_ENDPGM 0
...
+
+---
+name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 undef %4:sreg_32_xm0_xexec, undef %5:sreg_32_xm0_xexec, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ %sgpr4:sreg_32 = S_OR_B32 undef %40:sreg_32_xm0_xexec, undef %41:sreg_32_xm0_xexec, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
>From 7dbdbe42a6505e51cc380c16ba4928fd24bcd6ca Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Tue, 4 Nov 2025 23:42:18 -0600
Subject: [PATCH 09/13] Fix handling of undef
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index db00619b5cfeb..0c105e631f669 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10703,15 +10703,17 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
MachineInstr *Def1 = MRI->getUniqueVRegDef(OrOpnd1.getReg());
MachineInstr *Def2 = MRI->getUniqueVRegDef(OrOpnd2.getReg());
- if (Def1->getOpcode() == AMDGPU::COPY &&
- Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
+ if (Def1 && Def1->getOpcode() == AMDGPU::COPY &&
+ Def2 && Def2->getOpcode() == AMDGPU::COPY &&
+ Def1->getOperand(1).isReg() &&
Def2->getOperand(1).isReg() &&
Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
- Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg() &&
- foldableSelect(
- *MRI->getUniqueVRegDef(Def1->getOperand(1).getReg()))) {
- optimizeSCC(Def1, Def, RI);
+ Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
+ MachineInstr *Select = MRI->getUniqueVRegDef(Def1->getOperand(1).getReg());
+ if (Select && foldableSelect(*Select)) {
+ optimizeSCC(Def1, Def, RI);
+ }
}
}
}
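
The undef operands in the previous test have no defining instruction, so each
def lookup can return null and must be guarded before dereferencing. A minimal
standalone sketch of the hazard (toy types, not the LLVM API):

    #include <cassert>

    struct Instr { int Opcode = 0; };

    // Models a def lookup that fails for an undef operand.
    Instr *lookupDef(bool IsUndef) {
      static Instr TheCopy{1};
      return IsUndef ? nullptr : &TheCopy;
    }

    int main() {
      Instr *Def = lookupDef(/*IsUndef=*/true);
      // Short-circuit guard, as in the "Def1 && ... && Def2 && ..." chain.
      bool Matches = Def && Def->Opcode == 1;
      assert(!Matches); // no crash; the fold simply does not apply
      return 0;
    }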
>From 92f73c632a12ada133f085b363df1525b6443b4a Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Tue, 4 Nov 2025 23:55:54 -0600
Subject: [PATCH 10/13] Use getVRegDef
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0c105e631f669..d03a916344ce5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10669,7 +10669,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (CmpValue != 0)
return false;
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
if (!Def || Def->getParent() != CmpInstr.getParent())
return false;
@@ -10701,19 +10701,18 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
const MachineOperand &OrOpnd1 = Def->getOperand(1);
const MachineOperand &OrOpnd2 = Def->getOperand(2);
if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
- MachineInstr *Def1 = MRI->getUniqueVRegDef(OrOpnd1.getReg());
- MachineInstr *Def2 = MRI->getUniqueVRegDef(OrOpnd2.getReg());
- if (Def1 && Def1->getOpcode() == AMDGPU::COPY &&
- Def2 && Def2->getOpcode() == AMDGPU::COPY &&
- Def1->getOperand(1).isReg() &&
+ MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
+ MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
+ if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
+ Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
Def2->getOperand(1).isReg() &&
Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
- MachineInstr *Select = MRI->getUniqueVRegDef(Def1->getOperand(1).getReg());
- if (Select && foldableSelect(*Select)) {
- optimizeSCC(Def1, Def, RI);
- }
+ MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
+ if (Select && foldableSelect(*Select)) {
+ optimizeSCC(Def1, Def, RI);
+ }
}
}
}
@@ -10746,7 +10745,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
// s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
- MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+ MachineInstr *Def = MRI->getVRegDef(SrcReg);
if (!Def || Def->getParent() != CmpInstr.getParent())
return false;
>From 3f8d2092d54c1dd94f6f19f7318ee9d136df0d5f Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 5 Nov 2025 07:55:29 -0600
Subject: [PATCH 11/13] Use correct instruction for scan start point
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d03a916344ce5..789b4ff17fd76 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10711,7 +10711,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
if (Select && foldableSelect(*Select)) {
- optimizeSCC(Def1, Def, RI);
+ optimizeSCC(Select, Def, RI);
}
}
}
>From d815e12654cff8f1f5fafa9d41be0f1e9be1fe3c Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 5 Nov 2025 08:17:17 -0600
Subject: [PATCH 12/13] Ensure scan points have same parent
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 789b4ff17fd76..529ae48c3b213 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10625,6 +10625,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
const SIRegisterInfo &RI) {
MachineInstr *KillsSCC = nullptr;
+ if (SCCValid->getParent() != SCCRedefine->getParent())
+ return false;
for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
SCCRedefine->getIterator())) {
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
@@ -10670,7 +10672,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
return false;
MachineInstr *Def = MRI->getVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ if (!Def)
return false;
// For S_OP that set SCC = DST!=0, do the transformation
@@ -10746,7 +10748,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
MachineInstr *Def = MRI->getVRegDef(SrcReg);
- if (!Def || Def->getParent() != CmpInstr.getParent())
+ if (!Def)
return false;
if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
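
The scan in optimizeSCC walks the half-open range from just past the
SCC-defining instruction up to the redefining one, which is only meaningful
when both live in the same block; it must also give up if anything in between
clobbers SCC (the negative test added next exercises exactly that). A
standalone sketch with a plain list standing in for a basic block
(illustration only):

    #include <cassert>
    #include <iterator>
    #include <list>
    #include <string>

    int main() {
      // Stand-in "basic block". Pairing iterators from two different lists
      // is undefined, which is what the new same-parent early-out rules out.
      std::list<std::string> Block = {"s_cselect_b64", "s_mov_b32",
                                      "s_cmp_lg_u32"};
      auto SCCValid = Block.begin();             // instruction defining SCC
      auto SCCRedefine = std::prev(Block.end()); // instruction redefining SCC
      bool Clobbered = false;
      for (auto It = std::next(SCCValid); It != SCCRedefine; ++It)
        if (*It == "s_add_u32" || *It == "s_cmp_lg_u32") // these write SCC
          Clobbered = true;
      assert(!Clobbered); // only s_mov_b32 in between; it leaves SCC intact
      return 0;
    }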
>From 8889e21f4ad2f4c5e96b7d7a9ef9ebf531af0530 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 5 Nov 2025 08:26:03 -0600
Subject: [PATCH 13/13] Add negative test for intervening scc def
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 47 +++++++++++++++++++
1 file changed, 47 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index 0fe665551b9b6..5b71482439fb7 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -2320,6 +2320,53 @@ body: |
bb.2:
S_ENDPGM 0
+...
+---
+# Do not delete s_or_b32 because of an intervening def of scc
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec
+ %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+ %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
...
---