[llvm] 62d8b8a - Fix 64-bit copy to SCC

Piotr Sobczak via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 9 11:51:15 PDT 2020


Author: Piotr Sobczak
Date: 2020-08-09T20:50:30+02:00
New Revision: 62d8b8a2253c4615723e4fdd92505f25d78c75ee

URL: https://github.com/llvm/llvm-project/commit/62d8b8a2253c4615723e4fdd92505f25d78c75ee
DIFF: https://github.com/llvm/llvm-project/commit/62d8b8a2253c4615723e4fdd92505f25d78c75ee.diff

LOG: Fix 64-bit copy to SCC

Fix 64-bit copy to SCC by restricting the pattern resulting
in such a copy to subtargets supporting 64-bit scalar compare,
and mapping the copy to S_CMP_LG_U64.

Before introducing the S_CSELECT pattern with explicit SCC
(0045786f146e78afee49eee053dc29ebc842fee1), there was no need
for handling 64-bit copy to SCC ($scc = COPY sreg_64).

The proposed handling to read only the low bits was however
based on a false premise that it is only one bit that matters,
while in fact the copy source might be a vector of booleans and
all bits need to be considered.

The practical problem of mapping the 64-bit copy to SCC is that
the natural instruction to use (S_CMP_LG_U64) is not available
on old hardware. Fix it by restricting the problematic pattern
to subtargets supporting the instruction (hasScalarCompareEq64).

Differential Revision: https://reviews.llvm.org/D85207

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/ctlz.ll
    llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
    llvm/test/CodeGen/AMDGPU/fceil64.ll
    llvm/test/CodeGen/AMDGPU/fshl.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/mad_uint24.ll
    llvm/test/CodeGen/AMDGPU/sad.ll
    llvm/test/CodeGen/AMDGPU/sdiv.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/select-opt.ll
    llvm/test/CodeGen/AMDGPU/select-vectors.ll
    llvm/test/CodeGen/AMDGPU/select64.ll
    llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/trunc.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/udivrem.ll
    llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/vselect.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 52fa3880a7f5..0cd7acb7a789 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -742,15 +742,20 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   if (DestReg == AMDGPU::SCC) {
     // Copying 64-bit or 32-bit sources to SCC barely makes sense,
     // but SelectionDAG emits such copies for i1 sources.
-    // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit.
     if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
-      SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0);
+      // This copy can only be produced by patterns
+      // with explicit SCC, which are known to be enabled
+      // only for subtargets with S_CMP_LG_U64 present.
+      assert(ST.hasScalarCompareEq64());
+      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addImm(0);
+    } else {
+      assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+      BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addImm(0);
     }
-    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
-
-    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
-        .addReg(SrcReg, getKillRegState(KillSrc))
-        .addImm(0);
 
     return;
   }

diff  --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1bff2fe76c44..db8f3c9185c9 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -469,10 +469,15 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32",
 } // End isCommutable = 1
 } // End Defs = [SCC]
 
+// This pattern is restricted to certain subtargets (practically GFX8Plus)
+// because isel sometimes produces an sreg_64 copy to SCC as a by-product
+// of this pattern, and only for subtargets with hasScalarCompareEq64
+// is it possible to map such copy to a single instruction (S_CMP_LG_U64).
 class SelectPat<SDPatternOperator select> : PatFrag <
   (ops node:$src1, node:$src2),
   (select SCC, $src1, $src2),
-  [{ return N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
+  [{ return Subtarget->hasScalarCompareEq64() &&
+            N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
 >;
 
 let Uses = [SCC] in {

diff  --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index eb6c98774ea6..2f01b4879226 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX7 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX8 %s
 
 ; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
 ; the global address space(1) uses 64-bit pointers.  These tests check to make sure
@@ -57,9 +57,11 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
-; SI: s_cmp_lg_u32
-; SI-NOT: v_cmp_ne_u32
-; SI: s_cselect_b32
+; GFX7: v_cmp_ne_u32
+; GFX7: v_cndmask_b32
+; GFX8: s_cmp_lg_u32
+; GFX8-NOT: v_cmp_ne_u32
+; GFX8: s_cselect_b32
 define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
   %cmp = icmp ne i32 addrspace(3)* %lds, null
   %x = select i1 %cmp, i32 123, i32 456

diff  --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 5c6b3d32ab93..a63552b91f36 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -148,12 +148,13 @@ define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)
 ; HSA: enable_sgpr_queue_ptr = 0
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
+; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
-; CI-DAG: v_cmp_ne_u64_e64 s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
-; CI-DAG: s_cmp_lg_u32 s[[CMP_LO]], 0
 ; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, 0
-; HSA-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
-; HSA-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
+; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
+; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
 ; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
   %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
@@ -167,12 +168,13 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
 ; HSA: enable_sgpr_queue_ptr = 0
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
+; CI-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
+; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
+; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
-; CI-DAG: v_cmp_ne_u64_e64 s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
-; CI-DAG: s_cmp_lg_u32 s[[CMP_LO]], 0
 ; GFX9-DAG: s_cmp_lg_u64 s{{\[}}[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]{{\]}}, 0
-; HSA-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
-; HSA-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
+; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
+; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
   %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index f7d67b9f465a..c44f5dd6bd59 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -266,13 +266,13 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s4, s3, 31
 ; GCN-NEXT:    s_add_i32 s3, s3, s4
-; GCN-NEXT:    s_xor_b32 s4, s3, s4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GCN-NEXT:    s_sub_i32 s3, 0, s4
-; GCN-NEXT:    s_ashr_i32 s5, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s5
+; GCN-NEXT:    s_xor_b32 s6, s3, s4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GCN-NEXT:    s_sub_i32 s3, 0, s6
+; GCN-NEXT:    s_ashr_i32 s4, s2, 31
+; GCN-NEXT:    s_add_i32 s2, s2, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s6, s2, s5
+; GCN-NEXT:    s_xor_b32 s5, s2, s4
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -280,17 +280,17 @@ define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = srem i32 %x, %y
@@ -436,15 +436,15 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv i16 %x, %y
@@ -485,24 +485,24 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s5, s4, 16
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s5
-; GCN-NEXT:    s_sext_i32_i16 s2, s4
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
-; GCN-NEXT:    s_xor_b32 s2, s2, s5
+; GCN-NEXT:    s_ashr_i32 s2, s4, 16
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GCN-NEXT:    s_sext_i32_i16 s3, s4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GCN-NEXT:    s_xor_b32 s3, s3, s2
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s2, s2, 30
-; GCN-NEXT:    s_or_b32 s6, s2, 1
+; GCN-NEXT:    s_ashr_i32 s3, s3, 30
+; GCN-NEXT:    s_or_b32 s3, s3, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s2, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -647,15 +647,15 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv i8 %x, %y
@@ -694,29 +694,29 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; GCN-LABEL: srem_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_i32 s0, s2, 0x80008
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GCN-NEXT:    s_sext_i32_i8 s1, s2
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s1
-; GCN-NEXT:    s_xor_b32 s0, s1, s0
+; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
+; GCN-NEXT:    s_sext_i32_i8 s3, s0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GCN-NEXT:    s_xor_b32 s1, s3, s1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_lshr_b32 s3, s2, 8
-; GCN-NEXT:    s_or_b32 s6, s0, 1
+; GCN-NEXT:    s_ashr_i32 s1, s1, 30
+; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    s_lshr_b32 s2, s0, 8
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %r = srem i8 %x, %y
@@ -860,88 +860,87 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GCN-LABEL: udiv_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s12, 0x4f7ffffe
+; GCN-NEXT:    s_mov_b32 s3, 0x4f7ffffe
+; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s15, 0xf000
 ; GCN-NEXT:    s_mov_b32 s14, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GCN-NEXT:    s_sub_i32 s2, 0, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s10
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s11
+; GCN-NEXT:    v_mul_f32_e32 v0, s3, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s12, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GCN-NEXT:    s_sub_i32 s2, 0, s9
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GCN-NEXT:    s_sub_i32 s2, 0, s10
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v3, v1, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, s8
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, s12, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
-; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GCN-NEXT:    s_sub_i32 s4, 0, s10
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v2
-; GCN-NEXT:    s_sub_i32 s4, 0, s11
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, s9
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s5, v3
-; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, v1, s9
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, s3, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
+; GCN-NEXT:    v_mul_lo_u32 v4, s2, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
+; GCN-NEXT:    s_sub_i32 s0, 0, s11
+; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s11
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, s10
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
-; GCN-NEXT:    v_mul_f32_e32 v4, s12, v4
+; GCN-NEXT:    v_mul_f32_e32 v4, s3, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, s10
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
+; GCN-NEXT:    v_mul_lo_u32 v5, s0, v4
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
-; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
-; GCN-NEXT:    v_mul_lo_u32 v6, s4, v4
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, s7, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v4
-; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, v3, s11
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s7, v4
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v4
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, s11
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GCN-NEXT:    s_endpgm
   %r = udiv <4 x i32> %x, %y
@@ -1077,72 +1076,72 @@ define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GCN-LABEL: urem_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s12, 0x4f7ffffe
+; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GCN-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s11
+; GCN-NEXT:    s_sub_i32 s2, 0, s8
+; GCN-NEXT:    s_sub_i32 s12, 0, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_sub_i32 s3, 0, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
-; GCN-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s11
+; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s12, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, s13, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
-; GCN-NEXT:    s_sub_i32 s2, 0, s10
-; GCN-NEXT:    v_mul_f32_e32 v2, s12, v2
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mul_lo_u32 v4, s12, v1
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, s3, v1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, s13, v3
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GCN-NEXT:    v_mul_f32_e32 v3, s12, v3
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v0
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GCN-NEXT:    s_sub_i32 s2, 0, s11
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GCN-NEXT:    s_sub_i32 s4, 0, s10
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v1
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v5
+; GCN-NEXT:    s_sub_i32 s4, 0, s11
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_f32_e32 v3, s13, v4
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
-; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
 ; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, s11
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
@@ -1328,126 +1327,125 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ;
 ; GCN-LABEL: sdiv_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s16, 0x4f7ffffe
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s14, s8, 31
-; GCN-NEXT:    s_add_i32 s2, s8, s14
-; GCN-NEXT:    s_xor_b32 s12, s2, s14
+; GCN-NEXT:    s_ashr_i32 s2, s12, 31
+; GCN-NEXT:    s_add_i32 s3, s12, s2
+; GCN-NEXT:    s_xor_b32 s12, s3, s2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GCN-NEXT:    s_ashr_i32 s8, s9, 31
-; GCN-NEXT:    s_add_i32 s2, s9, s8
-; GCN-NEXT:    s_xor_b32 s15, s2, s8
+; GCN-NEXT:    s_ashr_i32 s3, s13, 31
+; GCN-NEXT:    s_add_i32 s0, s13, s3
+; GCN-NEXT:    s_xor_b32 s13, s0, s3
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GCN-NEXT:    s_sub_i32 s3, 0, s12
-; GCN-NEXT:    s_ashr_i32 s9, s4, 31
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GCN-NEXT:    s_sub_i32 s1, 0, s12
+; GCN-NEXT:    s_ashr_i32 s0, s8, 31
 ; GCN-NEXT:    v_mul_f32_e32 v0, s16, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    s_add_i32 s2, s4, s9
-; GCN-NEXT:    s_xor_b32 s2, s2, s9
-; GCN-NEXT:    v_mul_lo_u32 v2, s3, v0
+; GCN-NEXT:    s_xor_b32 s2, s0, s2
+; GCN-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GCN-NEXT:    s_add_i32 s1, s8, s0
 ; GCN-NEXT:    v_mul_f32_e32 v1, s16, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_sub_i32 s3, 0, s15
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GCN-NEXT:    s_ashr_i32 s4, s5, 31
-; GCN-NEXT:    v_mul_lo_u32 v3, s3, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    s_sub_i32 s0, 0, s13
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, s12
+; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
-; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s12, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
-; GCN-NEXT:    s_add_i32 s2, s5, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GCN-NEXT:    s_xor_b32 s2, s2, s4
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mul_hi_u32 v1, s2, v1
-; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
-; GCN-NEXT:    s_xor_b32 s0, s9, s14
-; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, s15
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; GCN-NEXT:    s_ashr_i32 s3, s6, 31
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
-; GCN-NEXT:    s_ashr_i32 s2, s10, 31
-; GCN-NEXT:    s_add_i32 s0, s10, s2
-; GCN-NEXT:    s_xor_b32 s5, s0, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s5
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s15, v2
+; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GCN-NEXT:    s_ashr_i32 s0, s9, 31
+; GCN-NEXT:    s_add_i32 s1, s9, s0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    s_xor_b32 s2, s0, s3
+; GCN-NEXT:    s_ashr_i32 s3, s14, 31
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-NEXT:    s_add_i32 s0, s14, s3
+; GCN-NEXT:    s_xor_b32 s9, s0, s3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GCN-NEXT:    s_sub_i32 s0, 0, s5
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, s13
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v3, s16, v3
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GCN-NEXT:    s_xor_b32 s1, s4, s8
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s13, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GCN-NEXT:    s_sub_i32 s0, 0, s9
 ; GCN-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GCN-NEXT:    s_add_i32 s0, s6, s3
-; GCN-NEXT:    s_xor_b32 s0, s0, s3
-; GCN-NEXT:    s_ashr_i32 s4, s11, 31
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_mul_hi_u32 v2, v3, v5
-; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s1, v1
-; GCN-NEXT:    s_xor_b32 s2, s3, s2
+; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
+; GCN-NEXT:    s_ashr_i32 s2, s15, 31
+; GCN-NEXT:    s_ashr_i32 s0, s10, 31
+; GCN-NEXT:    s_add_i32 s8, s15, s2
+; GCN-NEXT:    s_add_i32 s1, s10, s0
+; GCN-NEXT:    s_xor_b32 s8, s8, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s8
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; GCN-NEXT:    v_mul_hi_u32 v2, s0, v2
-; GCN-NEXT:    s_mov_b32 s15, 0xf000
-; GCN-NEXT:    s_mov_b32 s14, -1
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, s5
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s0, v3
-; GCN-NEXT:    s_add_i32 s0, s11, s4
-; GCN-NEXT:    s_xor_b32 s6, s0, s4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
+; GCN-NEXT:    v_mul_hi_u32 v2, s1, v2
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    s_sub_i32 s0, 0, s6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
+; GCN-NEXT:    s_xor_b32 s3, s0, s3
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, s9
 ; GCN-NEXT:    v_mul_f32_e32 v4, s16, v4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GCN-NEXT:    v_mul_lo_u32 v6, s0, v4
-; GCN-NEXT:    s_ashr_i32 s0, s7, 31
-; GCN-NEXT:    s_add_i32 s1, s7, s0
-; GCN-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-NEXT:    v_mul_hi_u32 v3, v4, v6
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
-; GCN-NEXT:    s_xor_b32 s2, s0, s4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT:    v_mul_hi_u32 v3, s1, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, v3, s6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s1, v4
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s6, v4
+; GCN-NEXT:    s_sub_i32 s0, 0, s8
+; GCN-NEXT:    v_mul_lo_u32 v5, s0, v4
+; GCN-NEXT:    s_ashr_i32 s0, s11, 31
+; GCN-NEXT:    s_add_i32 s1, s11, s0
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
+; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
+; GCN-NEXT:    s_xor_b32 s2, s0, s2
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, s1, v4
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT:    v_xor_b32_e32 v2, s3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, v4, s8
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv <4 x i32> %x, %y
   store <4 x i32> %r, <4 x i32> addrspace(1)* %out
@@ -1614,111 +1612,111 @@ define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ; GCN-LABEL: srem_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s14, 0x4f7ffffe
+; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s2, s8, 31
-; GCN-NEXT:    s_add_i32 s3, s8, s2
-; GCN-NEXT:    s_xor_b32 s2, s3, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT:    s_sub_i32 s13, 0, s2
-; GCN-NEXT:    s_ashr_i32 s12, s9, 31
-; GCN-NEXT:    s_add_i32 s9, s9, s12
+; GCN-NEXT:    s_add_i32 s8, s8, s2
+; GCN-NEXT:    s_xor_b32 s12, s8, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GCN-NEXT:    s_ashr_i32 s8, s9, 31
+; GCN-NEXT:    s_add_i32 s9, s9, s8
+; GCN-NEXT:    s_xor_b32 s14, s9, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s9, s9, s12
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT:    s_ashr_i32 s3, s4, 31
-; GCN-NEXT:    v_mul_f32_e32 v0, s14, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GCN-NEXT:    s_sub_i32 s9, 0, s12
+; GCN-NEXT:    s_ashr_i32 s8, s4, 31
+; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    s_add_i32 s4, s4, s3
-; GCN-NEXT:    s_xor_b32 s4, s4, s3
-; GCN-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s14, v1
+; GCN-NEXT:    s_add_i32 s4, s4, s8
+; GCN-NEXT:    s_xor_b32 s4, s4, s8
+; GCN-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, s13, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_sub_i32 s13, 0, s9
+; GCN-NEXT:    s_sub_i32 s9, 0, s14
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GCN-NEXT:    s_ashr_i32 s12, s10, 31
-; GCN-NEXT:    s_ashr_i32 s8, s5, 31
-; GCN-NEXT:    s_add_i32 s5, s5, s8
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s13, v1
-; GCN-NEXT:    s_xor_b32 s5, s5, s8
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GCN-NEXT:    v_mul_lo_u32 v2, s9, v1
+; GCN-NEXT:    s_ashr_i32 s9, s5, 31
+; GCN-NEXT:    s_add_i32 s5, s5, s9
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s12
 ; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GCN-NEXT:    s_add_i32 s2, s10, s12
-; GCN-NEXT:    s_xor_b32 s2, s2, s12
+; GCN-NEXT:    s_xor_b32 s4, s5, s9
+; GCN-NEXT:    s_ashr_i32 s5, s10, 31
+; GCN-NEXT:    s_add_i32 s10, s10, s5
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GCN-NEXT:    s_xor_b32 s10, s10, s5
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
-; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GCN-NEXT:    v_mul_hi_u32 v1, s4, v1
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GCN-NEXT:    s_sub_i32 s3, 0, s2
-; GCN-NEXT:    s_ashr_i32 s4, s6, 31
-; GCN-NEXT:    v_mul_f32_e32 v2, s14, v2
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, s14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GCN-NEXT:    v_mul_f32_e32 v2, s13, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_mul_lo_u32 v4, s3, v2
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GCN-NEXT:    s_sub_i32 s4, 0, s10
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_mul_hi_u32 v3, v2, v4
-; GCN-NEXT:    s_ashr_i32 s5, s11, 31
-; GCN-NEXT:    s_add_i32 s3, s6, s4
-; GCN-NEXT:    s_add_i32 s6, s11, s5
-; GCN-NEXT:    s_xor_b32 s5, s6, s5
+; GCN-NEXT:    s_ashr_i32 s4, s6, 31
+; GCN-NEXT:    s_add_i32 s5, s6, s4
+; GCN-NEXT:    s_ashr_i32 s6, s11, 31
+; GCN-NEXT:    s_add_i32 s8, s11, s6
+; GCN-NEXT:    s_xor_b32 s8, s8, s6
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s5
-; GCN-NEXT:    s_xor_b32 s3, s3, s4
-; GCN-NEXT:    v_mul_hi_u32 v2, s3, v2
-; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s8
+; GCN-NEXT:    s_xor_b32 s5, s5, s4
+; GCN-NEXT:    v_mul_hi_u32 v2, s5, v2
+; GCN-NEXT:    v_xor_b32_e32 v1, s9, v1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s8, v1
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GCN-NEXT:    s_ashr_i32 s6, s7, 31
-; GCN-NEXT:    v_mul_f32_e32 v3, s14, v3
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s10
+; GCN-NEXT:    v_mul_f32_e32 v3, s13, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v2
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v2
-; GCN-NEXT:    s_sub_i32 s3, 0, s5
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v2
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GCN-NEXT:    s_sub_i32 s5, 0, s8
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    v_mul_hi_u32 v4, v3, v5
-; GCN-NEXT:    s_add_i32 s2, s7, s6
-; GCN-NEXT:    s_xor_b32 s7, s2, s6
-; GCN-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_lo_u32 v4, s5, v3
+; GCN-NEXT:    s_ashr_i32 s5, s7, 31
+; GCN-NEXT:    s_add_i32 s6, s7, s5
+; GCN-NEXT:    s_xor_b32 s6, s6, s5
+; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, v3, s8
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, s5
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v3
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v3
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, s6, v3
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT:    v_xor_b32_e32 v3, s5, v3
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s5, v3
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = srem <4 x i32> %x, %y
@@ -2156,71 +2154,71 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GCN-NEXT:    s_xor_b32 s8, s9, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_ashr_i32 s2, s2, 16
 ; GCN-NEXT:    s_ashr_i32 s8, s8, 30
-; GCN-NEXT:    s_or_b32 s10, s8, 1
+; GCN-NEXT:    s_or_b32 s8, s8, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s8, s10, 0
-; GCN-NEXT:    s_ashr_i32 s2, s2, 16
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GCN-NEXT:    s_ashr_i32 s0, s0, 16
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    s_ashr_i32 s0, s0, 16
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
 ; GCN-NEXT:    s_xor_b32 s0, s0, s2
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_sext_i32_i16 s2, s3
-; GCN-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_sext_i32_i16 s0, s3
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s8, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_or_b32 s0, s0, 1
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, 0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v3
-; GCN-NEXT:    s_sext_i32_i16 s0, s1
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GCN-NEXT:    s_xor_b32 s0, s0, s2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GCN-NEXT:    s_sext_i32_i16 s2, s1
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GCN-NEXT:    s_xor_b32 s0, s2, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
 ; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-NEXT:    v_mad_f32 v1, -v4, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, 0
+; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
+; GCN-NEXT:    v_mov_b32_e32 v5, s0
+; GCN-NEXT:    s_ashr_i32 s0, s3, 16
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    s_ashr_i32 s2, s3, 16
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s0, v4
-; GCN-NEXT:    s_ashr_i32 s0, s1, 16
-; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v0
-; GCN-NEXT:    s_xor_b32 s0, s0, s2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-NEXT:    s_ashr_i32 s1, s1, 16
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GCN-NEXT:    s_xor_b32 s0, s1, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
-; GCN-NEXT:    v_mad_f32 v4, -v5, v0, v4
+; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v5
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GCN-NEXT:    s_mov_b32 s0, 0xffff
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
-; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv <4 x i16> %x, %y
@@ -2351,15 +2349,15 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GCN-NEXT:    s_xor_b32 s8, s9, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_ashr_i32 s8, s8, 30
-; GCN-NEXT:    s_or_b32 s10, s8, 1
+; GCN-NEXT:    s_or_b32 s8, s8, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s8, s10, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s8, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GCN-NEXT:    s_ashr_i32 s2, s2, 16
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
@@ -2369,15 +2367,15 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
 ; GCN-NEXT:    s_xor_b32 s8, s0, s2
 ; GCN-NEXT:    s_ashr_i32 s8, s8, 30
-; GCN-NEXT:    s_or_b32 s10, s8, 1
+; GCN-NEXT:    s_or_b32 s8, s8, 1
 ; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v2|, |v1|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s8, s10, 0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s8, v3
+; GCN-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, s2
 ; GCN-NEXT:    s_sext_i32_i16 s2, s3
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
@@ -2391,33 +2389,33 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
-; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v2|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, 0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s0, v4
+; GCN-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s3, 16
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
+; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
-; GCN-NEXT:    s_ashr_i32 s8, s1, 16
-; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s8
-; GCN-NEXT:    s_xor_b32 s2, s8, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-NEXT:    s_ashr_i32 s2, s1, 16
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
-; GCN-NEXT:    s_ashr_i32 s2, s2, 30
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
-; GCN-NEXT:    s_or_b32 s9, s2, 1
+; GCN-NEXT:    s_xor_b32 s3, s2, s0
+; GCN-NEXT:    s_ashr_i32 s3, s3, 30
 ; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
 ; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v4|, |v2|
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s2, s9, 0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s2, v5
+; GCN-NEXT:    s_or_b32 s3, s3, 1
+; GCN-NEXT:    v_mov_b32_e32 v6, s3
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
 ; GCN-NEXT:    s_mov_b32 s0, 0xffff
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s8, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -2571,15 +2569,15 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -2619,29 +2617,29 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; GCN-LABEL: srem_i3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_i32 s0, s2, 0x30008
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GCN-NEXT:    s_bfe_i32 s1, s2, 0x30000
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s1
-; GCN-NEXT:    s_xor_b32 s0, s1, s0
+; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
+; GCN-NEXT:    s_bfe_i32 s3, s0, 0x30000
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GCN-NEXT:    s_xor_b32 s1, s3, s1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_lshr_b32 s3, s2, 8
-; GCN-NEXT:    s_or_b32 s6, s0, 1
+; GCN-NEXT:    s_ashr_i32 s1, s1, 30
+; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    s_lshr_b32 s2, s0, 8
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -2992,54 +2990,54 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GCN-NEXT:    s_xor_b32 s8, s9, s8
-; GCN-NEXT:    s_ashr_i32 s8, s8, 30
+; GCN-NEXT:    s_ashr_i32 s0, s0, 16
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_or_b32 s10, s8, 1
-; GCN-NEXT:    s_sext_i32_i16 s1, s1
+; GCN-NEXT:    s_ashr_i32 s8, s8, 30
+; GCN-NEXT:    s_or_b32 s8, s8, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s8, s10, 0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 16
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_ashr_i32 s2, s2, 16
-; GCN-NEXT:    s_xor_b32 s0, s2, s0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s8, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GCN-NEXT:    s_xor_b32 s0, s2, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
 ; GCN-NEXT:    s_or_b32 s0, s0, 1
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
-; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
+; GCN-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-NEXT:    s_sext_i32_i16 s0, s1
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v2|, |v0|
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, 0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s0, v3
-; GCN-NEXT:    s_sext_i32_i16 s0, s3
-; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
+; GCN-NEXT:    s_sext_i32_i16 s1, s3
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GCN-NEXT:    s_xor_b32 s0, s1, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-NEXT:    v_mad_f32 v3, -v4, v0, v3
+; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
-; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
-; GCN-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GCN-NEXT:    v_mov_b32_e32 v5, s0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -3144,19 +3142,20 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GCN-NEXT:    s_ashr_i32 s6, s6, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_or_b32 s6, s6, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s6, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s6, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 16
 ; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v3
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
 ; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
 ; GCN-NEXT:    v_bfe_i32 v5, v1, 0, 16
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v6, v5
@@ -3174,7 +3173,7 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
 ; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    s_sext_i32_i16 s2, s3
 ; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s2
@@ -3186,13 +3185,12 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
 ; GCN-NEXT:    v_mad_f32 v3, -v5, v4, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v3|, |v4|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, 0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v5
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
+; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
@@ -3566,51 +3564,51 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GCN-NEXT:    s_xor_b32 s1, s1, s3
+; GCN-NEXT:    s_bfe_i32 s0, s0, 0xf000f
 ; GCN-NEXT:    s_ashr_i32 s1, s1, 30
-; GCN-NEXT:    s_or_b32 s1, s1, 1
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v3|, |v2|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    s_bfe_i32 s0, s0, 0xf000f
-; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
-; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 15
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s1, v4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
+; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf000f
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GCN-NEXT:    s_xor_b32 s0, s1, s0
+; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 15
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s2, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
-; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v2|
-; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, s0, v5
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v2
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mad_f32 v5, -v1, v2, v5
+; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v2|
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_and_b32_e32 v2, s0, v3
-; GCN-NEXT:    v_and_b32_e32 v3, s0, v4
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_and_b32_e32 v3, s0, v3
 ; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
+; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -3715,52 +3713,52 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
 ; GCN-NEXT:    s_movk_i32 s3, 0x7fff
-; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
-; GCN-NEXT:    s_and_b32 s1, s0, s3
-; GCN-NEXT:    s_bfe_i32 s1, s1, 0xf0000
-; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s1
-; GCN-NEXT:    s_and_b32 s8, s2, s3
-; GCN-NEXT:    s_bfe_i32 s8, s8, 0xf0000
-; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s8
+; GCN-NEXT:    s_and_b32 s11, s0, s3
+; GCN-NEXT:    s_bfe_i32 s11, s11, 0xf0000
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s11
+; GCN-NEXT:    s_and_b32 s9, s2, s3
+; GCN-NEXT:    s_bfe_i32 s9, s9, 0xf0000
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GCN-NEXT:    s_xor_b32 s1, s8, s1
-; GCN-NEXT:    s_ashr_i32 s1, s1, 30
-; GCN-NEXT:    s_lshr_b32 s10, s2, 15
+; GCN-NEXT:    s_xor_b32 s9, s9, s11
+; GCN-NEXT:    s_ashr_i32 s9, s9, 30
+; GCN-NEXT:    s_or_b32 s9, s9, 1
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    s_bfe_u32 s11, s2, 0xf000f
-; GCN-NEXT:    s_lshr_b32 s12, s0, 15
-; GCN-NEXT:    s_bfe_u32 s13, s0, 0xf000f
-; GCN-NEXT:    s_or_b32 s1, s1, 1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v3|, |v2|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s1, v4
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    s_bfe_u32 s12, s0, 0xf000f
+; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
-; GCN-NEXT:    s_bfe_i32 s0, s13, 0xf0000
+; GCN-NEXT:    s_lshr_b32 s1, s0, 15
+; GCN-NEXT:    s_bfe_i32 s0, s12, 0xf0000
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
-; GCN-NEXT:    s_bfe_i32 s1, s11, 0xf0000
-; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
-; GCN-NEXT:    s_xor_b32 s0, s1, s0
+; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
+; GCN-NEXT:    s_lshr_b32 s8, s2, 15
+; GCN-NEXT:    s_bfe_i32 s2, s10, 0xf0000
+; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GCN-NEXT:    s_xor_b32 s0, s2, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
-; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
 ; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
+; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; GCN-NEXT:    v_bfe_i32 v4, v1, 0, 15
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v5
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
 ; GCN-NEXT:    v_bfe_i32 v6, v0, 0, 15
@@ -3775,11 +3773,11 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v6, v6
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, s12
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GCN-NEXT:    v_mul_lo_u32 v1, v4, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
 ; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
@@ -4004,53 +4002,53 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-NEXT:    s_movk_i32 s4, 0x1000
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s5, s4, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GCN-NEXT:    s_lshl_b32 s10, s4, s3
-; GCN-NEXT:    s_mov_b32 s3, 0x4f7ffffe
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
+; GCN-NEXT:    s_lshl_b32 s8, s4, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GCN-NEXT:    s_lshl_b32 s9, s4, s3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_sub_i32 s2, 0, s5
+; GCN-NEXT:    s_mov_b32 s0, 0x4f7ffffe
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, s3, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, s0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GCN-NEXT:    s_sub_i32 s0, 0, s8
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GCN-NEXT:    s_sub_i32 s2, 0, s10
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, s0, v0
+; GCN-NEXT:    s_sub_i32 s0, 0, s9
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GCN-NEXT:    v_mul_lo_u32 v3, v0, s5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
-; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, s10
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-NEXT:    v_mul_hi_u32 v1, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, s9
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -4235,50 +4233,50 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-NEXT:    s_movk_i32 s4, 0x1000
-; GCN-NEXT:    s_mov_b32 s7, 0x4f7ffffe
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s2, s4, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT:    s_lshl_b32 s6, s4, s3
-; GCN-NEXT:    s_sub_i32 s3, 0, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GCN-NEXT:    s_lshl_b32 s8, s4, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GCN-NEXT:    s_lshl_b32 s3, s4, s3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GCN-NEXT:    s_mov_b32 s4, 0x4f7ffffe
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_sub_i32 s2, 0, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, s7, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s7, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v2, s3, v0
-; GCN-NEXT:    s_sub_i32 s3, 0, s6
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
+; GCN-NEXT:    s_sub_i32 s2, 0, s3
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s3, v1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, s6
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = urem <2 x i32> %x, %shl.y
@@ -4347,40 +4345,40 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GCN-LABEL: sdiv_i32_pow2_shl_denom:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GCN-NEXT:    s_ashr_i32 s8, s3, 31
-; GCN-NEXT:    s_add_i32 s3, s3, s8
-; GCN-NEXT:    s_xor_b32 s9, s3, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GCN-NEXT:    s_sub_i32 s3, 0, s9
-; GCN-NEXT:    s_ashr_i32 s0, s2, 31
-; GCN-NEXT:    s_add_i32 s1, s2, s0
+; GCN-NEXT:    s_ashr_i32 s4, s3, 31
+; GCN-NEXT:    s_add_i32 s3, s3, s4
+; GCN-NEXT:    s_xor_b32 s7, s3, s4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GCN-NEXT:    s_sub_i32 s3, 0, s7
+; GCN-NEXT:    s_ashr_i32 s5, s2, 31
+; GCN-NEXT:    s_add_i32 s2, s2, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-NEXT:    s_xor_b32 s2, s0, s8
+; GCN-NEXT:    s_xor_b32 s6, s2, s5
+; GCN-NEXT:    s_xor_b32 s4, s5, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
+; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, v0, s7
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -4550,68 +4548,68 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GCN-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GCN-NEXT:    s_movk_i32 s6, 0x1000
-; GCN-NEXT:    s_mov_b32 s12, 0x4f7ffffe
+; GCN-NEXT:    s_movk_i32 s10, 0x1000
+; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s2, s6, s2
-; GCN-NEXT:    s_ashr_i32 s10, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s10
-; GCN-NEXT:    s_xor_b32 s11, s2, s10
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s11
-; GCN-NEXT:    s_sub_i32 s1, 0, s11
-; GCN-NEXT:    s_lshl_b32 s0, s6, s3
-; GCN-NEXT:    s_ashr_i32 s3, s0, 31
+; GCN-NEXT:    s_lshl_b32 s2, s10, s2
+; GCN-NEXT:    s_ashr_i32 s11, s2, 31
+; GCN-NEXT:    s_add_i32 s2, s2, s11
+; GCN-NEXT:    s_xor_b32 s12, s2, s11
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GCN-NEXT:    s_lshl_b32 s0, s10, s3
+; GCN-NEXT:    s_sub_i32 s3, 0, s12
+; GCN-NEXT:    s_ashr_i32 s2, s0, 31
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_add_i32 s0, s0, s3
-; GCN-NEXT:    s_xor_b32 s13, s0, s3
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s13
-; GCN-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GCN-NEXT:    s_add_i32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s10, s0, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    s_ashr_i32 s2, s8, 31
-; GCN-NEXT:    s_add_i32 s0, s8, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, s2
-; GCN-NEXT:    v_mul_lo_u32 v1, s1, v0
+; GCN-NEXT:    s_ashr_i32 s1, s8, 31
+; GCN-NEXT:    s_add_i32 s0, s8, s1
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    s_xor_b32 s2, s2, s10
+; GCN-NEXT:    s_xor_b32 s3, s1, s11
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s12, v2
+; GCN-NEXT:    v_mul_f32_e32 v1, s13, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v2, v0, s11
+; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s11, v2
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GCN-NEXT:    s_sub_i32 s0, 0, s13
-; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
+; GCN-NEXT:    s_sub_i32 s0, 0, s10
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v1
 ; GCN-NEXT:    s_ashr_i32 s0, s9, 31
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, v4
 ; GCN-NEXT:    s_add_i32 s1, s9, s0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    s_xor_b32 s1, s1, s0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-NEXT:    s_xor_b32 s2, s0, s2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    s_xor_b32 s2, s0, s3
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, s13
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, s10
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s13, v2
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
@@ -4692,13 +4690,13 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
 ; GCN-NEXT:    s_ashr_i32 s4, s3, 31
 ; GCN-NEXT:    s_add_i32 s3, s3, s4
-; GCN-NEXT:    s_xor_b32 s4, s3, s4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GCN-NEXT:    s_sub_i32 s3, 0, s4
-; GCN-NEXT:    s_ashr_i32 s5, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s5
+; GCN-NEXT:    s_xor_b32 s6, s3, s4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GCN-NEXT:    s_sub_i32 s3, 0, s6
+; GCN-NEXT:    s_ashr_i32 s4, s2, 31
+; GCN-NEXT:    s_add_i32 s2, s2, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s6, s2, s5
+; GCN-NEXT:    s_xor_b32 s5, s2, s4
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -4706,17 +4704,17 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
@@ -4849,65 +4847,65 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-NEXT:    s_movk_i32 s6, 0x1000
-; GCN-NEXT:    s_mov_b32 s7, 0x4f7ffffe
+; GCN-NEXT:    s_mov_b32 s10, 0x4f7ffffe
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s2, s6, s2
 ; GCN-NEXT:    s_ashr_i32 s4, s2, 31
 ; GCN-NEXT:    s_add_i32 s2, s2, s4
-; GCN-NEXT:    s_xor_b32 s2, s2, s4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT:    s_lshl_b32 s3, s6, s3
-; GCN-NEXT:    s_ashr_i32 s6, s3, 31
-; GCN-NEXT:    s_add_i32 s3, s3, s6
+; GCN-NEXT:    s_xor_b32 s9, s2, s4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GCN-NEXT:    s_lshl_b32 s2, s6, s3
+; GCN-NEXT:    s_ashr_i32 s6, s2, 31
+; GCN-NEXT:    s_add_i32 s2, s2, s6
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s3, s3, s6
-; GCN-NEXT:    s_sub_i32 s6, 0, s2
+; GCN-NEXT:    s_sub_i32 s8, 0, s9
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GCN-NEXT:    v_mul_f32_e32 v0, s7, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, s10, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s8, s0, 31
-; GCN-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    s_add_i32 s0, s0, s8
-; GCN-NEXT:    s_xor_b32 s0, s0, s8
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GCN-NEXT:    v_mul_f32_e32 v1, s7, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_sub_i32 s6, 0, s3
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GCN-NEXT:    s_ashr_i32 s9, s1, 31
-; GCN-NEXT:    s_add_i32 s1, s1, s9
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_ashr_i32 s3, s0, 31
+; GCN-NEXT:    s_add_i32 s0, s0, s3
+; GCN-NEXT:    v_mul_lo_u32 v1, s8, v0
+; GCN-NEXT:    s_xor_b32 s8, s2, s6
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GCN-NEXT:    s_xor_b32 s0, s0, s3
+; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GCN-NEXT:    s_sub_i32 s2, 0, s8
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, s10, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
+; GCN-NEXT:    s_ashr_i32 s0, s1, 31
+; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
+; GCN-NEXT:    s_add_i32 s1, s1, s0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT:    s_xor_b32 s0, s1, s9
+; GCN-NEXT:    s_xor_b32 s1, s1, s0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mul_hi_u32 v1, s0, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, s8
+; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, s9, v1
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
+; GCN-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -4930,121 +4928,121 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_movk_i32 s2, 0xfee0
 ; GCN-NEXT:    s_mov_b32 s3, 0x68958c89
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    s_movk_i32 s8, 0x11f
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT:    s_movk_i32 s9, 0x11e
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x11e
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, s3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v8, v2, s3
+; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
 ; GCN-NEXT:    s_mov_b32 s2, 0x976a7377
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, s3
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GCN-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v11, v2, v5
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v7
-; GCN-NEXT:    v_mul_hi_u32 v7, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
+; GCN-NEXT:    s_movk_i32 s3, 0x11f
+; GCN-NEXT:    s_mov_b32 s9, s5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v0, s3
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, s2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, s2
-; GCN-NEXT:    v_mov_b32_e32 v5, s8
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, s2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s10, v3
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s2, v3
-; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v4
-; GCN-NEXT:    s_mov_b32 s10, 0x976a7376
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v6, s11
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v6, v2, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v2
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v4
+; GCN-NEXT:    s_mov_b32 s2, 0x976a7376
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v6, s7
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v3
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
   store i64 %r, i64 addrspace(1)* %out
@@ -5219,7 +5217,6 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
-; GCN-NEXT:    s_movk_i32 s8, 0xffe
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
@@ -5234,24 +5231,25 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v4
 ; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v3
+; GCN-NEXT:    s_movk_i32 s0, 0xffe
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v7, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -5313,120 +5311,120 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_movk_i32 s2, 0xfee0
 ; GCN-NEXT:    s_mov_b32 s3, 0x689e0837
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    s_movk_i32 s8, 0x11f
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GCN-NEXT:    s_mov_b32 s12, 0x9761f7c9
+; GCN-NEXT:    s_movk_i32 s12, 0x11f
+; GCN-NEXT:    s_mov_b32 s13, 0x9761f7c9
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    s_movk_i32 s9, 0x11e
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s9, s5
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
+; GCN-NEXT:    s_movk_i32 s5, 0x11e
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, s3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v8, v2, s3
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v7, v0, s3
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GCN-NEXT:    v_mul_lo_u32 v8, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v11, v2, v5
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v7
-; GCN-NEXT:    v_mul_hi_u32 v7, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
+; GCN-NEXT:    s_mov_b32 s8, s4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
+; GCN-NEXT:    s_mov_b32 s4, 0x9761f7c8
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s12
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, s12
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s12
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT:    v_mul_lo_u32 v1, v1, s13
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s13
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s10, v0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
-; GCN-NEXT:    v_mov_b32_e32 v3, s8
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s12, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v5
-; GCN-NEXT:    s_mov_b32 s10, 0x9761f7c8
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
-; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v5, v1, s[0:1]
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v1
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
+; GCN-NEXT:    v_mov_b32_e32 v3, s12
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s13, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s4, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s13, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
   %r = urem i64 %x, 1235195393993
   store i64 %r, i64 addrspace(1)* %out
@@ -5659,30 +5657,30 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_mul_hi_u32 v3, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v4
 ; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
-; GCN-NEXT:    s_mov_b32 s3, 0x12d8fa
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
+; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -5743,28 +5741,26 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    s_add_u32 s2, s2, s12
 ; GCN-NEXT:    s_mov_b32 s13, s12
 ; GCN-NEXT:    s_addc_u32 s3, s3, s12
-; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[12:13]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GCN-NEXT:    s_sub_u32 s2, 0, s14
-; GCN-NEXT:    s_subb_u32 s3, 0, s15
-; GCN-NEXT:    s_ashr_i32 s16, s11, 31
+; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
+; GCN-NEXT:    s_ashr_i32 s14, s11, 31
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s17, s16
+; GCN-NEXT:    s_mov_b32 s15, s14
 ; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -5785,11 +5781,12 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
@@ -5807,10 +5804,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GCN-NEXT:    s_add_u32 s0, s10, s16
-; GCN-NEXT:    s_addc_u32 s1, s11, s16
+; GCN-NEXT:    s_add_u32 s0, s10, s14
+; GCN-NEXT:    s_addc_u32 s1, s11, s14
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[16:17]
+; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
@@ -5821,47 +5818,48 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s15, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s15
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s10, v3
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s14, v3
-; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v6, s11
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v6, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    s_xor_b64 s[0:1], s[16:17], s[12:13]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -6019,30 +6017,30 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GCN-NEXT:    v_mul_hi_u32 v3, s9, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s9
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v4
 ; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
-; GCN-NEXT:    s_movk_i32 s9, 0xffe
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v3
+; GCN-NEXT:    s_movk_i32 s0, 0xffe
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
@@ -6076,8 +6074,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
 ; GCN-NEXT:    s_mov_b32 s3, 0
 ; GCN-NEXT:    s_movk_i32 s2, 0x1000
-; GCN-NEXT:    s_mov_b32 s20, 0x4f800000
-; GCN-NEXT:    s_mov_b32 s21, 0x5f7ffffc
+; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
+; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
 ; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
@@ -6088,28 +6086,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GCN-NEXT:    s_mov_b32 s22, 0x2f800000
-; GCN-NEXT:    s_mov_b32 s23, 0xcf800000
+; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
+; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
 ; GCN-NEXT:    s_sub_u32 s6, 0, s14
-; GCN-NEXT:    v_mac_f32_e32 v0, s20, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_subb_u32 s7, 0, s15
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-NEXT:    v_mul_f32_e32 v0, s21, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s22, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, s23, v1
+; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s18, s9, 31
-; GCN-NEXT:    s_add_u32 s0, s8, s18
 ; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
-; GCN-NEXT:    s_mov_b32 s19, s18
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
@@ -6121,8 +6115,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT:    s_addc_u32 s1, s9, s18
-; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[18:19]
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -6155,7 +6147,13 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i32 s2, s9, 31
+; GCN-NEXT:    s_add_u32 s0, s8, s2
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    s_addc_u32 s1, s9, s2
+; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
@@ -6166,6 +6164,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
@@ -6180,71 +6179,68 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
-; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s8, v3
-; GCN-NEXT:    v_subb_u32_e64 v5, vcc, v5, v7, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, s14, v3
-; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v8, v7, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v0
-; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
+; GCN-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v7, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v0
+; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    s_ashr_i32 s8, s13, 31
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GCN-NEXT:    s_add_u32 s12, s12, s8
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v8, s9
-; GCN-NEXT:    s_xor_b64 s[8:9], s[18:19], s[16:17]
-; GCN-NEXT:    s_ashr_i32 s16, s13, 31
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v8, v2, s[0:1]
-; GCN-NEXT:    s_add_u32 s0, s12, s16
-; GCN-NEXT:    s_mov_b32 s17, s16
-; GCN-NEXT:    s_addc_u32 s1, s13, s16
-; GCN-NEXT:    s_xor_b64 s[12:13], s[0:1], s[16:17]
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_addc_u32 s13, s13, s8
+; GCN-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v10, s12
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v11, s13
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
-; GCN-NEXT:    v_mac_f32_e32 v10, s20, v11
+; GCN-NEXT:    v_mac_f32_e32 v10, s18, v11
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v10
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[2:3]
-; GCN-NEXT:    v_mul_f32_e32 v3, s21, v3
-; GCN-NEXT:    v_mul_f32_e32 v5, s22, v3
+; GCN-NEXT:    s_sub_u32 s14, 0, s12
+; GCN-NEXT:    v_mul_f32_e32 v3, s19, v3
+; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
-; GCN-NEXT:    v_mac_f32_e32 v3, s23, v5
+; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT:    s_sub_u32 s2, 0, s12
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_mul_hi_u32 v2, s2, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v5
-; GCN-NEXT:    s_subb_u32 s3, 0, s13
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v3
-; GCN-NEXT:    s_ashr_i32 s14, s11, 31
+; GCN-NEXT:    v_mul_hi_u32 v2, s14, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, s14, v5
+; GCN-NEXT:    s_subb_u32 s15, 0, s13
+; GCN-NEXT:    v_mul_lo_u32 v8, s15, v3
+; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, s14, v3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
 ; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
-; GCN-NEXT:    s_mov_b32 s15, s14
+; GCN-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, s9, v1
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
@@ -6252,11 +6248,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v8, s2, v3
-; GCN-NEXT:    v_mul_hi_u32 v9, s2, v2
-; GCN-NEXT:    v_mul_lo_u32 v10, s3, v2
+; GCN-NEXT:    v_mul_lo_u32 v8, s14, v3
+; GCN-NEXT:    v_mul_hi_u32 v9, s14, v2
+; GCN-NEXT:    v_mul_lo_u32 v10, s15, v2
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_mul_lo_u32 v9, s2, v2
+; GCN-NEXT:    v_mul_lo_u32 v9, s14, v2
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
 ; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
@@ -6273,9 +6269,11 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GCN-NEXT:    s_ashr_i32 s14, s11, 31
 ; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
 ; GCN-NEXT:    s_add_u32 s0, s10, s14
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    s_mov_b32 s15, s14
 ; GCN-NEXT:    s_addc_u32 s1, s11, s14
 ; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
@@ -6288,7 +6286,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
 ; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT:    v_mov_b32_e32 v8, s9
+; GCN-NEXT:    v_mov_b32_e32 v8, s3
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
@@ -6296,32 +6294,32 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, s12, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, s12, v2
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s13, v2
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, s12, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
 ; GCN-NEXT:    v_mov_b32_e32 v7, s13
-; GCN-NEXT:    v_sub_i32_e64 v5, s[0:1], s10, v5
-; GCN-NEXT:    v_subb_u32_e64 v6, vcc, v6, v7, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, s12, v5
-; GCN-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v6
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 2, v2
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v2
-; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v3, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
+; GCN-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
+; GCN-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
+; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v8, s11
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v8, v4, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
@@ -6329,9 +6327,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[16:17]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GCN-NEXT:    v_xor_b32_e32 v3, s1, v3
@@ -6440,28 +6438,28 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
 ; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
-; GCN-NEXT:    s_mov_b32 s3, 0x12d8fa
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
+; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -6613,25 +6611,25 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s10, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s12, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v5, v1, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
@@ -6640,7 +6638,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s14, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s14, v1
@@ -6820,43 +6818,43 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_mul_lo_u32 v0, s16, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s8, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v5, s[2:3], s16, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v7, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v7
-; GCN-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s16, v5
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GCN-NEXT:    s_ashr_i32 s2, s15, 31
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GCN-NEXT:    s_add_u32 s8, s14, s2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v7, s9
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v7, v1, s[0:1]
-; GCN-NEXT:    s_ashr_i32 s0, s15, 31
-; GCN-NEXT:    s_add_u32 s8, s14, s0
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_addc_u32 s9, s15, s0
-; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v10, s9
+; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    s_addc_u32 s9, s15, s2
+; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
+; GCN-NEXT:    v_cvt_f32_u32_e32 v8, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s9
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GCN-NEXT:    v_mac_f32_e32 v8, s18, v9
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GCN-NEXT:    v_rcp_f32_e32 v8, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
-; GCN-NEXT:    v_mac_f32_e32 v9, s18, v10
-; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GCN-NEXT:    v_rcp_f32_e32 v8, v9
+; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
 ; GCN-NEXT:    v_mul_f32_e32 v3, s19, v8
 ; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
@@ -6937,30 +6935,30 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_mul_lo_u32 v3, s8, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, s8, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, s9, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, s8, v2
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
+; GCN-NEXT:    v_mul_lo_u32 v2, s8, v2
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_sub_i32_e64 v2, s[0:1], s10, v2
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
 ; GCN-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v6, s[2:3], s8, v2
-; GCN-NEXT:    v_subbrev_u32_e64 v7, vcc, 0, v4, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v7
-; GCN-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v6
-; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
+; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v7, s11
-; GCN-NEXT:    v_subb_u32_e64 v3, vcc, v7, v3, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
@@ -6969,7 +6967,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v2, s14, v2
 ; GCN-NEXT:    v_xor_b32_e32 v3, s14, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 31671b9a1576..666234b90235 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -25,10 +25,10 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_flbit_i32_b32 s0, s2
-; SI-NEXT:    s_cmp_lg_u32 s2, 0
-; SI-NEXT:    s_cselect_b32 s0, s0, 32
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s2, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -384,13 +384,14 @@ define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32],
 ; SI-NEXT:    s_flbit_i32_b32 s0, s2
 ; SI-NEXT:    s_flbit_i32_b32 s1, s3
 ; SI-NEXT:    s_add_i32 s0, s0, 32
-; SI-NEXT:    s_cmp_eq_u32 s3, 0
-; SI-NEXT:    s_cselect_b32 s0, s0, s1
-; SI-NEXT:    s_or_b32 s1, s2, s3
-; SI-NEXT:    s_cmp_lg_u32 s1, 0
-; SI-NEXT:    s_cselect_b32 s0, s0, 64
+; SI-NEXT:    s_or_b32 s2, s2, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s2, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -442,17 +443,18 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_flbit_i32_b32 s0, s2
 ; SI-NEXT:    s_flbit_i32_b32 s1, s3
 ; SI-NEXT:    s_add_i32 s0, s0, 32
-; SI-NEXT:    s_cmp_eq_u32 s3, 0
-; SI-NEXT:    s_cselect_b32 s0, s0, s1
-; SI-NEXT:    s_or_b32 s1, s2, s3
-; SI-NEXT:    s_cmp_lg_u32 s1, 0
-; SI-NEXT:    s_cselect_b32 s0, s0, 64
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    s_or_b32 s2, s2, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s2, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index a34fb5c503d6..0fbf68f33607 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -99,13 +99,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
 
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
-; GCN-DAG: s_cmp_eq_u32 s[[HI]], 0{{$}}
+; SI-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
+; VI-DAG: s_cmp_eq_u32 s[[HI]], 0{{$}}
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
 ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
-; GCN-DAG: s_cselect_b32 [[RES:s[0-9]+]], [[ADD]], [[FFBH_HI]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
+; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
+; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
+; VI-DAG: s_cselect_b32 [[RES:s[0-9]+]], [[ADD]], [[FFBH_HI]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[CTLZ:[0-9]+]], [[RES]]
+; VI-DAG: v_mov_b32_e32 v[[CTLZ:[0-9]+]], [[RES]]
 ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
 define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)

diff  --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index 8dd0f46c9303..1165cd82a299 100644
--- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -147,26 +147,23 @@ define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)*
 ; GCN-LABEL: no_extract_volatile_load_dynextract:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s12, s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_load_dword s12, s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s10, s2
 ; GCN-NEXT:    s_mov_b32 s11, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_mov_b32 s8, s6
 ; GCN-NEXT:    s_mov_b32 s9, s7
 ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NEXT:    s_cmp_eq_u32 s12, 1
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s12, 2
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_eq_u32 s12, 3
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 3
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index f0cd6a1df9ef..da852af3f230 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -27,7 +27,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; SI-DAG: v_cmp_gt_f64
 ; SI-DAG: v_cmp_lg_f64
 ; SI-DAG: v_cndmask_b32
-; SI: s_cselect_b32
+; SI: v_cndmask_b32
 ; SI: v_add_f64
 ; SI: s_endpgm
 define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) {

diff  --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 9f3255f4e500..2ecce1807921 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -18,12 +18,11 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_sub_i32 s3, 32, s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    s_and_b32 s1, s2, 31
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    s_cmp_eq_u32 s1, 0
+; SI-NEXT:    s_and_b32 s1, s2, 31
 ; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -153,21 +152,19 @@ define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
 ; SI-NEXT:    s_sub_i32 s10, 32, s1
-; SI-NEXT:    s_and_b32 s1, s1, 31
 ; SI-NEXT:    v_mov_b32_e32 v1, s10
-; SI-NEXT:    s_cmp_eq_u32 s1, 0
+; SI-NEXT:    s_and_b32 s1, s1, 31
 ; SI-NEXT:    v_alignbit_b32 v0, s3, v0, v1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    s_sub_i32 s1, 32, s0
-; SI-NEXT:    s_and_b32 s0, s0, 31
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT:    s_cmp_eq_u32 s0, 0
+; SI-NEXT:    s_and_b32 s0, s0, 31
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v2, s1
 ; SI-NEXT:    v_alignbit_b32 v0, s2, v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v2, s2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -336,39 +333,35 @@ define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    s_sub_i32 s16, 32, s3
-; SI-NEXT:    s_and_b32 s3, s3, 31
 ; SI-NEXT:    v_mov_b32_e32 v1, s16
-; SI-NEXT:    s_cmp_eq_u32 s3, 0
+; SI-NEXT:    s_and_b32 s3, s3, 31
 ; SI-NEXT:    v_alignbit_b32 v0, s11, v0, v1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
 ; SI-NEXT:    s_sub_i32 s3, 32, s2
-; SI-NEXT:    s_and_b32 s2, s2, 31
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; SI-NEXT:    s_cmp_eq_u32 s2, 0
+; SI-NEXT:    s_and_b32 s2, s2, 31
 ; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_alignbit_b32 v0, s10, v0, v1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s10
 ; SI-NEXT:    s_sub_i32 s2, 32, s1
-; SI-NEXT:    s_and_b32 s1, s1, 31
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; SI-NEXT:    s_cmp_eq_u32 s1, 0
+; SI-NEXT:    s_and_b32 s1, s1, 31
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_alignbit_b32 v0, s9, v0, v1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
 ; SI-NEXT:    s_sub_i32 s1, 32, s0
-; SI-NEXT:    s_and_b32 s0, s0, 31
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-NEXT:    s_cmp_eq_u32 s0, 0
+; SI-NEXT:    s_and_b32 s0, s0, 31
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v4, s1
 ; SI-NEXT:    v_alignbit_b32 v0, s8, v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index fa34e42f22b4..768d25ee06ff 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -140,16 +140,14 @@ define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x,
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
 ; SI-NEXT:    s_and_b32 s1, s1, 31
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    s_cmp_eq_u32 s1, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_and_b32 s0, s0, 31
 ; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; SI-NEXT:    s_cmp_eq_u32 s0, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -309,30 +307,26 @@ define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x,
 ; SI-NEXT:    v_mov_b32_e32 v0, s15
 ; SI-NEXT:    s_and_b32 s3, s3, 31
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    s_cmp_eq_u32 s3, 0
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_and_b32 s2, s2, 31
 ; SI-NEXT:    v_alignbit_b32 v1, s11, v0, v1
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; SI-NEXT:    s_and_b32 s2, s2, 31
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
-; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s14
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
-; SI-NEXT:    s_and_b32 s1, s1, 31
 ; SI-NEXT:    v_alignbit_b32 v1, s10, v0, v1
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
+; SI-NEXT:    s_and_b32 s1, s1, 31
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
-; SI-NEXT:    s_cmp_eq_u32 s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_and_b32 s0, s0, 31
 ; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
-; SI-NEXT:    s_cmp_eq_u32 s0, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_mov_b32_e32 v4, s0
 ; SI-NEXT:    v_alignbit_b32 v4, s8, v0, v4
-; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 817e3e5ca28c..e5e67b1022d6 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -579,12 +579,12 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lg_u32 s4, 1
-; SI-NEXT:    s_cselect_b32 s5, s7, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cselect_b32 s4, s6, 5
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -618,15 +618,15 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lg_u32 s4, 2
-; SI-NEXT:    s_cselect_b32 s5, s10, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 1
-; SI-NEXT:    s_cselect_b32 s6, s9, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cselect_b32 s4, s8, 5
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -664,44 +664,45 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_eq_u32 s6, 3
-; SI-NEXT:    s_cselect_b32 s5, s4, s11
-; SI-NEXT:    s_cmp_eq_u32 s6, 2
-; SI-NEXT:    s_cselect_b32 s7, s4, s10
-; SI-NEXT:    s_cmp_eq_u32 s6, 1
-; SI-NEXT:    s_cselect_b32 s9, s4, s9
-; SI-NEXT:    s_cmp_eq_u32 s6, 0
-; SI-NEXT:    s_cselect_b32 s4, s4, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_mov_b32_e32 v2, s7
-; SI-NEXT:    v_mov_b32_e32 v3, s5
+; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
+; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
+; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:        s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT:        s_load_dwordx4 s[8:11], s[4:5], 0x10
-; VI-NEXT:        s_load_dword s6, s[4:5], 0x20
-; VI-NEXT:        s_load_dword s4, s[4:5], 0x44
-; VI-NEXT:        s_mov_b32 s3, 0x1100f000
-; VI-NEXT:        s_mov_b32 s2, -1
-; VI-NEXT:        s_waitcnt lgkmcnt(0)
-; VI-NEXT:        s_cmp_eq_u32 s6, 3
-; VI-NEXT:        s_cselect_b32 s5, s4, s11
-; VI-NEXT:        s_cmp_eq_u32 s6, 2
-; VI-NEXT:        s_cselect_b32 s7, s4, s10
-; VI-NEXT:        s_cmp_eq_u32 s6, 1
-; VI-NEXT:        s_cselect_b32 s9, s4, s9
-; VI-NEXT:        s_cmp_eq_u32 s6, 0
-; VI-NEXT:        s_cselect_b32 s4, s4, s8
-; VI-NEXT:        v_mov_b32_e32 v0, s4
-; VI-NEXT:        v_mov_b32_e32 v1, s9
-; VI-NEXT:        v_mov_b32_e32 v2, s7
-; VI-NEXT:        v_mov_b32_e32 v3, s5
-; VI-NEXT:        buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; VI-NEXT:        s_endpgm
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x44
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_cmp_eq_u32 s6, 3
+; VI-NEXT:    s_cselect_b32 s5, s4, s11
+; VI-NEXT:    s_cmp_eq_u32 s6, 2
+; VI-NEXT:    s_cselect_b32 s7, s4, s10
+; VI-NEXT:    s_cmp_eq_u32 s6, 1
+; VI-NEXT:    s_cselect_b32 s9, s4, s9
+; VI-NEXT:    s_cmp_eq_u32 s6, 0
+; VI-NEXT:    s_cselect_b32 s4, s4, s8
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
   %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -716,32 +717,32 @@ define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lg_u32 s4, 3
-; SI-NEXT:    s_cselect_b32 s5, s11, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 2
-; SI-NEXT:    s_cselect_b32 s6, s10, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 1
-; SI-NEXT:    s_cselect_b32 s7, s9, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cselect_b32 s8, s8, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 7
-; SI-NEXT:    s_cselect_b32 s9, s15, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 6
-; SI-NEXT:    s_cselect_b32 s10, s14, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 5
-; SI-NEXT:    s_cselect_b32 s11, s13, 5
-; SI-NEXT:    s_cmp_lg_u32 s4, 4
-; SI-NEXT:    s_cselect_b32 s4, s12, 5
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_mov_b32_e32 v3, s9
-; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_mov_b32_e32 v5, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s5
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; SI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s15
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
+; SI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s14
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
+; SI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s13
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
+; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
+; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v8i32:
@@ -1121,95 +1122,107 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x8
-; SI-NEXT:    s_movk_i32 s7, 0xff
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s5, s11, 24
-; SI-NEXT:    s_cmp_lg_u32 s4, 15
-; SI-NEXT:    s_cselect_b32 s5, s5, 5
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_lshr_b32 s6, s11, 16
-; SI-NEXT:    s_cmp_lg_u32 s4, 14
-; SI-NEXT:    s_cselect_b32 s6, s6, 5
-; SI-NEXT:    s_and_b32 s6, s6, s7
-; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
+; SI-NEXT:    s_lshr_b32 s5, s11, 16
+; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    s_movk_i32 s5, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT:    v_and_b32_e32 v1, s5, v1
 ; SI-NEXT:    s_lshr_b32 s6, s11, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 13
-; SI-NEXT:    s_cselect_b32 s6, s6, 5
-; SI-NEXT:    s_lshl_b32 s6, s6, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 12
-; SI-NEXT:    s_cselect_b32 s11, s11, 5
-; SI-NEXT:    s_and_b32 s11, s11, s7
-; SI-NEXT:    s_or_b32 s6, s11, s6
-; SI-NEXT:    s_mov_b32 s11, 0xffff
-; SI-NEXT:    s_and_b32 s6, s6, s11
-; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_lshr_b32 s6, s10, 24
-; SI-NEXT:    s_cmp_lg_u32 s4, 11
-; SI-NEXT:    s_cselect_b32 s6, s6, 5
-; SI-NEXT:    s_lshl_b32 s6, s6, 8
-; SI-NEXT:    s_lshr_b32 s12, s10, 16
-; SI-NEXT:    s_cmp_lg_u32 s4, 10
-; SI-NEXT:    s_cselect_b32 s12, s12, 5
-; SI-NEXT:    s_and_b32 s12, s12, s7
-; SI-NEXT:    s_or_b32 s6, s12, s6
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshr_b32 s12, s10, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 9
-; SI-NEXT:    s_cselect_b32 s12, s12, 5
-; SI-NEXT:    s_lshl_b32 s12, s12, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 8
-; SI-NEXT:    s_cselect_b32 s10, s10, 5
-; SI-NEXT:    s_and_b32 s10, s10, s7
-; SI-NEXT:    s_or_b32 s10, s10, s12
-; SI-NEXT:    s_and_b32 s10, s10, s11
-; SI-NEXT:    s_or_b32 s6, s10, s6
-; SI-NEXT:    s_lshr_b32 s10, s9, 24
-; SI-NEXT:    s_cmp_lg_u32 s4, 7
-; SI-NEXT:    s_cselect_b32 s10, s10, 5
-; SI-NEXT:    s_lshl_b32 s10, s10, 8
-; SI-NEXT:    s_lshr_b32 s12, s9, 16
-; SI-NEXT:    s_cmp_lg_u32 s4, 6
-; SI-NEXT:    s_cselect_b32 s12, s12, 5
-; SI-NEXT:    s_and_b32 s12, s12, s7
-; SI-NEXT:    s_or_b32 s10, s12, s10
-; SI-NEXT:    s_lshl_b32 s10, s10, 16
-; SI-NEXT:    s_lshr_b32 s12, s9, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 5
-; SI-NEXT:    s_cselect_b32 s12, s12, 5
-; SI-NEXT:    s_lshl_b32 s12, s12, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 4
-; SI-NEXT:    s_cselect_b32 s9, s9, 5
-; SI-NEXT:    s_and_b32 s9, s9, s7
-; SI-NEXT:    s_or_b32 s9, s9, s12
-; SI-NEXT:    s_and_b32 s9, s9, s11
-; SI-NEXT:    s_or_b32 s9, s9, s10
-; SI-NEXT:    s_lshr_b32 s10, s8, 24
-; SI-NEXT:    s_cmp_lg_u32 s4, 3
-; SI-NEXT:    s_cselect_b32 s10, s10, 5
-; SI-NEXT:    s_lshl_b32 s10, s10, 8
-; SI-NEXT:    s_lshr_b32 s12, s8, 16
-; SI-NEXT:    s_cmp_lg_u32 s4, 2
-; SI-NEXT:    s_cselect_b32 s12, s12, 5
-; SI-NEXT:    s_and_b32 s12, s12, s7
-; SI-NEXT:    s_or_b32 s10, s12, s10
-; SI-NEXT:    s_lshl_b32 s10, s10, 16
-; SI-NEXT:    s_lshr_b32 s12, s8, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 1
-; SI-NEXT:    s_cselect_b32 s12, s12, 5
-; SI-NEXT:    s_lshl_b32 s12, s12, 8
-; SI-NEXT:    s_cmp_lg_u32 s4, 0
-; SI-NEXT:    s_cselect_b32 s4, s8, 5
-; SI-NEXT:    s_and_b32 s4, s4, s7
-; SI-NEXT:    s_or_b32 s4, s4, s12
-; SI-NEXT:    s_and_b32 s4, s4, s11
-; SI-NEXT:    s_or_b32 s4, s4, s10
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_mov_b32_e32 v3, s5
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
+; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT:    v_and_b32_e32 v2, s5, v2
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    s_mov_b32 s6, 0xffff
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v1, s6, v1
+; SI-NEXT:    s_lshr_b32 s7, s10, 24
+; SI-NEXT:    v_or_b32_e32 v3, v1, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
+; SI-NEXT:    s_lshr_b32 s7, s10, 16
+; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    s_lshr_b32 s7, s10, 8
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
+; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT:    v_and_b32_e32 v2, s5, v2
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v1, s6, v1
+; SI-NEXT:    s_lshr_b32 s7, s9, 24
+; SI-NEXT:    v_or_b32_e32 v2, v1, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
+; SI-NEXT:    s_lshr_b32 s7, s9, 16
+; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT:    v_and_b32_e32 v1, s5, v1
+; SI-NEXT:    s_lshr_b32 s7, s9, 8
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
+; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s9
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
+; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT:    v_and_b32_e32 v4, s5, v4
+; SI-NEXT:    v_or_b32_e32 v1, v4, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v1, s6, v1
+; SI-NEXT:    s_lshr_b32 s7, s8, 24
+; SI-NEXT:    v_or_b32_e32 v1, v1, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; SI-NEXT:    s_lshr_b32 s7, s8, 16
+; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; SI-NEXT:    v_and_b32_e32 v4, s5, v4
+; SI-NEXT:    s_lshr_b32 s7, s8, 8
+; SI-NEXT:    v_or_b32_e32 v0, v4, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v5, s8
+; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-NEXT:    v_and_b32_e32 v5, s5, v5
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_and_b32_e32 v4, s6, v4
+; SI-NEXT:    v_or_b32_e32 v0, v4, v0
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index f60db1a4627b..589bb63397b3 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -75,7 +75,8 @@ entry:
 ; Check that the select instruction is not deleted.
 ; FUNC-LABEL: {{^}}i24_i32_i32_mad:
 ; EG: CNDE_INT
-; SI: s_cselect
+; SI: v_cndmask
+; GCN2: s_cselect
 define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   %0 = ashr i32 %a, 8

diff  --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 514223d47d88..3a4a2d07772c 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -134,7 +134,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cmp_gt_u32  s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
@@ -254,12 +254,12 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
 
 ; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
 ; GCN: s_load_dword
-; GCN-DAG: s_bfe_u32
+; GCN: s_bfe_u32
 ; GCN-DAG: s_sub_i32
 ; GCN-DAG: s_and_b32
 ; GCN-DAG: s_sub_i32
 ; GCN-DAG: s_lshr_b32
-; GCN: s_add_i32
+; GCN: v_add_i32_e32
 define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %sub0 = sub i8 %a, %b
@@ -273,10 +273,10 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1:
+; GCN: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -294,7 +294,7 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)*
 ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2:
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %d

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 41c11f3b9153..eaa36c1ae1d8 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -392,76 +392,75 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrsp
 define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 ; GCN-LABEL: sdiv_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s2, s6
-; GCN-NEXT:    s_mov_b32 s3, s7
+; GCN-NEXT:    s_mov_b32 s10, s6
+; GCN-NEXT:    s_mov_b32 s11, s7
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s10
-; GCN-NEXT:    s_mov_b32 s1, s11
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s8, s2
+; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
-; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GCN-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GCN-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GCN-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GCN-NEXT:    v_xor_b32_e32 v8, v4, v5
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
-; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GCN-NEXT:    v_xor_b32_e32 v9, v6, v7
+; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v2
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GCN-NEXT:    v_xor_b32_e32 v0, v0, v7
-; GCN-NEXT:    v_xor_b32_e32 v4, v7, v4
+; GCN-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GCN-NEXT:    v_mul_f32_e32 v5, s2, v5
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v6, v5
-; GCN-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; GCN-NEXT:    v_mul_lo_u32 v8, v5, v2
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v8, v0
+; GCN-NEXT:    v_mul_f32_e32 v7, s2, v7
+; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GCN-NEXT:    v_mul_lo_u32 v10, v10, v5
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v11, v11, v7
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GCN-NEXT:    v_mul_hi_u32 v4, v5, v10
+; GCN-NEXT:    v_mul_hi_u32 v6, v7, v11
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT:    v_mul_lo_u32 v10, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
-; GCN-NEXT:    v_subrev_i32_e32 v8, vcc, v2, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
+; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
+; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, v3, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GCN-NEXT:    s_mov_b64 s[0:1], vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v6, v3
-; GCN-NEXT:    v_xor_b32_e32 v2, v0, v6
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v2
-; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 0, v2
-; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GCN-NEXT:    v_xor_b32_e32 v6, v3, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[0:1]
-; GCN-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v9, v9, v0
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v9
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GCN-NEXT:    v_mul_hi_u32 v3, v1, v0
-; GCN-NEXT:    v_xor_b32_e32 v0, v5, v4
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v2
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v9
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -800,137 +799,134 @@ define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32>
 define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 ; GCN-LABEL: sdiv_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    s_mov_b32 s2, s6
-; GCN-NEXT:    s_mov_b32 s3, s7
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s6, s10
+; GCN-NEXT:    s_mov_b32 s7, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s10
-; GCN-NEXT:    s_mov_b32 s1, s11
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; GCN-NEXT:    s_mov_b32 s10, 0x4f7ffffe
-; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s2
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCN-NEXT:    s_mov_b32 s2, 0x4f7ffffe
+; GCN-NEXT:    s_mov_b32 s8, s0
+; GCN-NEXT:    s_mov_b32 s9, s1
+; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT:    v_xor_b32_e32 v5, v5, v11
+; GCN-NEXT:    v_xor_b32_e32 v15, v8, v9
+; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v9, v5
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v0
-; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v2
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GCN-NEXT:    v_mul_f32_e32 v4, s10, v4
-; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
-; GCN-NEXT:    v_mul_lo_u32 v10, v4, v9
-; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0
-; GCN-NEXT:    v_mul_hi_u32 v10, v9, v10
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v8, v4
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; GCN-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
 ; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_ashrrev_i32_e32 v11, 31, v4
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GCN-NEXT:    v_xor_b32_e32 v4, v4, v11
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v9
-; GCN-NEXT:    v_xor_b32_e32 v8, v11, v8
-; GCN-NEXT:    v_mul_lo_u32 v12, v9, v0
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, 1, v9
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
-; GCN-NEXT:    v_sub_i32_e32 v12, vcc, v4, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v13, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v9
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
-; GCN-NEXT:    s_mov_b64 s[0:1], vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v10, v1
-; GCN-NEXT:    v_xor_b32_e32 v1, v0, v10
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v1
-; GCN-NEXT:    v_sub_i32_e32 v13, vcc, 0, v1
-; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v5
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v4, v5
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    v_xor_b32_e32 v5, v5, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s[0:1]
-; GCN-NEXT:    v_xor_b32_e32 v4, v4, v10
-; GCN-NEXT:    v_mul_f32_e32 v0, s10, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
-; GCN-NEXT:    v_mul_lo_u32 v13, v13, v0
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v13
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GCN-NEXT:    v_mul_hi_u32 v11, v5, v0
-; GCN-NEXT:    v_xor_b32_e32 v0, v9, v8
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v8, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, v11, v1
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v11
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v5, v8
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v5, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v9, vcc, v5, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v8
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
-; GCN-NEXT:    s_mov_b64 s[0:1], vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v14, v2
-; GCN-NEXT:    v_xor_b32_e32 v2, v1, v14
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v2
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GCN-NEXT:    v_mul_f32_e32 v9, s2, v9
+; GCN-NEXT:    v_xor_b32_e32 v6, v6, v13
+; GCN-NEXT:    v_xor_b32_e32 v16, v10, v11
+; GCN-NEXT:    v_cvt_f32_u32_e32 v11, v6
+; GCN-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GCN-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; GCN-NEXT:    v_mul_f32_e32 v8, s2, v8
+; GCN-NEXT:    v_xor_b32_e32 v17, v12, v13
+; GCN-NEXT:    v_xor_b32_e32 v2, v2, v12
+; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; GCN-NEXT:    v_mul_lo_u32 v12, v12, v9
+; GCN-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GCN-NEXT:    v_xor_b32_e32 v1, v1, v10
+; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
+; GCN-NEXT:    v_mul_lo_u32 v10, v10, v8
+; GCN-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GCN-NEXT:    v_mul_f32_e32 v11, s2, v11
+; GCN-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GCN-NEXT:    v_mul_hi_u32 v10, v8, v10
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
+; GCN-NEXT:    v_sub_i32_e32 v12, vcc, 0, v6
+; GCN-NEXT:    v_mul_lo_u32 v12, v12, v11
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v8
+; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; GCN-NEXT:    v_mul_hi_u32 v12, v11, v12
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; GCN-NEXT:    v_xor_b32_e32 v7, v7, v14
+; GCN-NEXT:    v_cvt_f32_u32_e32 v10, v7
+; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GCN-NEXT:    v_mul_lo_u32 v12, v8, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v9
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; GCN-NEXT:    v_mul_hi_u32 v11, v2, v11
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v12, vcc, v0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v0, v9, v5
+; GCN-NEXT:    v_mul_f32_e32 v10, s2, v10
+; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v10
+; GCN-NEXT:    v_mul_lo_u32 v10, v11, v6
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v9, vcc, v0, v5
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 1, v11
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
+; GCN-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
+; GCN-NEXT:    v_sub_i32_e32 v11, vcc, v2, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v1
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; GCN-NEXT:    v_xor_b32_e32 v1, v8, v15
+; GCN-NEXT:    v_xor_b32_e32 v5, v0, v16
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v15
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v16
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v7
+; GCN-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v1, s10, v1
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, v5, v1
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v6
-; GCN-NEXT:    v_xor_b32_e32 v5, v5, v10
-; GCN-NEXT:    v_mul_hi_u32 v6, v5, v1
-; GCN-NEXT:    v_xor_b32_e32 v1, v8, v4
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
-; GCN-NEXT:    v_xor_b32_e32 v10, v10, v14
-; GCN-NEXT:    v_mul_lo_u32 v4, v6, v2
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v6
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v4, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
-; GCN-NEXT:    s_mov_b64 s[0:1], vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v9, v3
-; GCN-NEXT:    v_xor_b32_e32 v3, v2, v9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v3
-; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v4, v7
-; GCN-NEXT:    v_xor_b32_e32 v9, v4, v9
-; GCN-NEXT:    v_xor_b32_e32 v4, v7, v4
-; GCN-NEXT:    v_mul_f32_e32 v2, s10, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_lo_u32 v8, v8, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v8
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, v4, v2
-; GCN-NEXT:    v_xor_b32_e32 v2, v5, v10
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v10, v2
-; GCN-NEXT:    v_mul_lo_u32 v5, v6, v3
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v6
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v6, v7, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v4, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
 ; GCN-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v9, v3
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
+; GCN-NEXT:    v_xor_b32_e32 v2, v2, v17
+; GCN-NEXT:    v_mul_lo_u32 v5, v4, v7
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
+; GCN-NEXT:    v_xor_b32_e32 v6, v9, v14
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v7
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GCN-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; TONGA-LABEL: sdiv_v4i32:
@@ -1497,7 +1493,7 @@ define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %i
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -1650,7 +1646,7 @@ define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)*
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -1830,7 +1826,7 @@ define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, |v2|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -2020,7 +2016,7 @@ define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)*
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 07df1108df56..1d235d1db5cd 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
@@ -14,124 +15,123 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_add_u32 s2, s2, s12
 ; GCN-NEXT:    s_mov_b32 s13, s12
 ; GCN-NEXT:    s_addc_u32 s3, s3, s12
-; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[12:13]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GCN-NEXT:    s_sub_u32 s2, 0, s14
-; GCN-NEXT:    s_subb_u32 s3, 0, s15
-; GCN-NEXT:    s_ashr_i32 s16, s11, 31
+; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
+; GCN-NEXT:    s_ashr_i32 s14, s11, 31
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s17, s16
-; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 s15, s14
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v6, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v11, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v6
+; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GCN-NEXT:    s_add_u32 s0, s10, s16
-; GCN-NEXT:    s_addc_u32 s1, s11, s16
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[16:17]
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v7, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
+; GCN-NEXT:    s_add_u32 s0, s10, s14
+; GCN-NEXT:    s_addc_u32 s1, s11, s14
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v5, s10, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, s11, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, s11, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s15, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s15
+; GCN-NEXT:    s_mov_b32 s4, s8
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s10, v3
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s14, v3
-; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v6, s11
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v6, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    s_xor_b64 s[0:1], s[16:17], s[12:13]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -144,98 +144,104 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-LABEL: s_test_sdiv:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i32 s2, s9, 31
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s10, s6, s0
+; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_subb_u32 s11, s7, s0
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[12:13], s[14:15]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
-; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s12, s12, s14
+; GCN-IR-NEXT:    s_ashr_i32 s8, s1, 31
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s10, s6, s2
+; GCN-IR-NEXT:    s_mov_b32 s9, s8
+; GCN-IR-NEXT:    s_subb_u32 s11, s7, s2
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[8:9], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s6, s0, s8
+; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s6
+; GCN-IR-NEXT:    s_subb_u32 s7, s1, s8
+; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s10
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
 ; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s11
-; GCN-IR-NEXT:    s_cmp_eq_u32 s11, 0
-; GCN-IR-NEXT:    s_cselect_b32 s16, s14, s15
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[14:15], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[20:21], s[14:15], 63
-; GCN-IR-NEXT:    s_xor_b64 s[22:23], s[18:19], -1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[22:23], s[20:21]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[10:11], s14
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s11, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[14:15], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[12:13], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[10:11], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[10:11], v4
 ; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s8, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s6, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s7, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
-; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-IR-NEXT:    s_branch BB0_6
-; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:  BB0_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[18:19]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[18:19]
-; GCN-IR-NEXT:  BB0_6: ; %Flow7
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB0_7
+; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB0_6: ; %Flow6
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB0_7: ; %Flow7
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[8:9], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
@@ -506,15 +512,15 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s6, s4, 1
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -536,15 +542,15 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -613,16 +619,16 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_xor_b32 s4, s7, s6
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -641,16 +647,16 @@ define amdgpu_kernel void @s_test_sdiv32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_xor_b32 s4, s7, s6
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-IR-NEXT:    s_mov_b32 s1, s5
-; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -678,15 +684,15 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s6, s4, 1
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -708,15 +714,15 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -745,15 +751,15 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s6, s4, 1
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -775,15 +781,15 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -812,15 +818,15 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-NEXT:    s_or_b32 s6, s4, 1
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -842,15 +848,15 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_xor_b32 s4, s4, s8
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s4, 30
-; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
+; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -876,35 +882,35 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s8
 ; GCN-NEXT:    s_xor_b32 s0, s8, s0
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
-; GCN-NEXT:    s_or_b32 s3, s0, 1
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s3, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s10
 ; GCN-NEXT:    s_xor_b32 s0, s10, s2
 ; GCN-NEXT:    s_ashr_i32 s0, s0, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s0, v4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -923,35 +929,35 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(<2 x i64> addrspace(1)* %out, <2
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s8
 ; GCN-IR-NEXT:    s_xor_b32 s0, s8, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
-; GCN-IR-NEXT:    s_or_b32 s3, s0, 1
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-IR-NEXT:    s_cselect_b32 s0, s3, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[10:11], 40
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v2, s2
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, s10
 ; GCN-IR-NEXT:    s_xor_b32 s0, s10, s2
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GCN-IR-NEXT:    s_or_b32 s2, s0, 1
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, s0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-IR-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-IR-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, s0, v4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-IR-NEXT:    v_bfe_i32 v2, v2, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -992,7 +998,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1008,99 +1014,105 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 24
 ; GCN-IR-NEXT:    s_sext_i32_i16 s7, s0
-; GCN-IR-NEXT:    s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
-; GCN-IR-NEXT:    s_ashr_i64 s[12:13], s[6:7], 24
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s10, s6, s0
+; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[2:3], 24
+; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_subb_u32 s11, s7, s0
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[12:13], s[14:15]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
-; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s12, s12, s14
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 24
+; GCN-IR-NEXT:    s_ashr_i32 s6, s7, 31
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s10, s0, s2
+; GCN-IR-NEXT:    s_mov_b32 s7, s6
+; GCN-IR-NEXT:    s_subb_u32 s11, s1, s2
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s8, s0, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s8
+; GCN-IR-NEXT:    s_subb_u32 s9, s1, s6
+; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s10
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
 ; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s11
-; GCN-IR-NEXT:    s_cmp_eq_u32 s11, 0
-; GCN-IR-NEXT:    s_cselect_b32 s16, s14, s15
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[14:15], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[20:21], s[14:15], 63
-; GCN-IR-NEXT:    s_xor_b64 s[22:23], s[18:19], -1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[22:23], s[20:21]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
-; GCN-IR-NEXT:    s_cbranch_vccz BB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[10:11], s14
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s11, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[14:15], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[8:9], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[12:13], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[10:11], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
-; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[10:11], v4
+; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s8, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s8, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s9, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_3
-; GCN-IR-NEXT:  BB9_4: ; %Flow3
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-IR-NEXT:    s_branch BB9_6
-; GCN-IR-NEXT:  BB9_5:
+; GCN-IR-NEXT:  BB9_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[18:19]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[18:19]
-; GCN-IR-NEXT:  BB9_6: ; %Flow4
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB9_7
+; GCN-IR-NEXT:  BB9_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB9_6: ; %Flow3
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB9_7: ; %Flow4
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[6:7], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
@@ -1124,15 +1136,15 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s8, s7, 31
-; GCN-NEXT:    s_add_u32 s0, s6, s8
-; GCN-NEXT:    s_addc_u32 s1, s7, s8
-; GCN-NEXT:    s_mov_b32 s9, s8
-; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GCN-NEXT:    s_sub_u32 s2, 0, s10
-; GCN-NEXT:    s_subb_u32 s3, 0, s11
+; GCN-NEXT:    s_ashr_i32 s2, s7, 31
+; GCN-NEXT:    s_add_u32 s0, s6, s2
+; GCN-NEXT:    s_addc_u32 s1, s7, s2
+; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GCN-NEXT:    s_sub_u32 s3, 0, s8
+; GCN-NEXT:    s_subb_u32 s10, 0, s9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
@@ -1144,10 +1156,10 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
-; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v5, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, s10, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s3, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
@@ -1166,11 +1178,11 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s3, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v8, s10, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
@@ -1200,133 +1212,139 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 0, v4
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v2, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s9, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s10, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s8, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], 24, v3
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v3
-; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 24, v3
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_sdiv_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s4, s3, 31
-; GCN-IR-NEXT:    s_mov_b32 s5, s4
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s2, s2, s4
-; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s4
-; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s10, s6, s7
-; GCN-IR-NEXT:    s_add_u32 s8, s10, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[8:9], 63
-; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[14:15], s[8:9], 63
-; GCN-IR-NEXT:    s_xor_b64 s[16:17], s[12:13], -1
-; GCN-IR-NEXT:    s_and_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[14:15]
-; GCN-IR-NEXT:    s_cbranch_vccz BB10_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s8, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_addc_u32 s15, s9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], 24, s8
+; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
+; GCN-IR-NEXT:    s_mov_b32 s3, s2
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s6, s0, s2
+; GCN-IR-NEXT:    s_subb_u32 s7, s1, s2
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[8:9], 0, -1, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[3:4], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], 24, v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB10_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], 24, s14
-; GCN-IR-NEXT:    s_add_u32 s8, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s9, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s10, 58, s10
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0
+; GCN-IR-NEXT:    s_add_u32 s8, s6, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], 24, v3
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 58, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s9, s7, -1
+; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[0:1], 0, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s6, s13, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s6, s8, s16
-; GCN-IR-NEXT:    s_subb_u32 s6, s9, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s18
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s19
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s8, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s6, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s7, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_3
-; GCN-IR-NEXT:  BB10_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch BB10_6
-; GCN-IR-NEXT:  BB10_5:
+; GCN-IR-NEXT:  BB10_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[12:13]
-; GCN-IR-NEXT:  BB10_6: ; %udiv-end
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s5, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB10_7
+; GCN-IR-NEXT:  BB10_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB10_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB10_7: ; %udiv-end
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = sdiv i64 24, %x
   store i64 %result, i64 addrspace(1)* %out
@@ -1877,16 +1895,16 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_or_b32 s2, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1904,16 +1922,16 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s2, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    s_or_b32 s2, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, s3, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s3
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-IR-NEXT:    s_cselect_b32 s0, s2, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1936,17 +1954,17 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_ashr_i32 s4, s6, 30
-; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mad_f32 v0, -v1, s8, v0
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    s_or_b32 s6, s4, 1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, s8
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
@@ -1962,17 +1980,17 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s6
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s6, 30
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_mad_f32 v0, -v1, s8, v0
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT:    s_or_b32 s6, s4, 1
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, s8
-; GCN-IR-NEXT:    s_cmp_lg_u32 s4, 0
-; GCN-IR-NEXT:    s_cselect_b32 s4, s6, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s4, v1
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s8
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
index 7054b013b0a6..24df126e4caf 100644
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -7,10 +7,8 @@
 ; GCN-LABEL: {{^}}opt_select_i32_and_cmp_i32:
 ; GCN-DAG: v_cmp_ne_u32_e32 vcc,
 ; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 [[CMP1]], vcc, [[CMP1]]
-; GCN: s_cselect_b32 [[SRESULT:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GCN-NOT: [[SRESULT]]
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
@@ -25,10 +23,8 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i3
 ; GCN-LABEL: {{^}}opt_select_i32_and_cmp_f32:
 ; GCN-DAG: v_cmp_lg_f32_e32 vcc
 ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 [[CMP1]], vcc, [[CMP1]]
-; GCN: s_cselect_b32 [[SRESULT:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GCN-NOT: [[SRESULT]]
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
@@ -75,10 +71,8 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, fl
 ; GCN-LABEL: {{^}}opt_select_i32_or_cmp_i32:
 ; GCN-DAG: v_cmp_ne_u32_e32 vcc,
 ; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 [[CMP1]], vcc, [[CMP1]]
-; GCN: s_cselect_b32 [[SRESULT:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GCN-NOT: [[SRESULT]]
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: s_endpgm
@@ -94,10 +88,8 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32
 ; GCN-LABEL: {{^}}opt_select_i32_or_cmp_f32:
 ; GCN-DAG: v_cmp_lg_f32_e32 vcc
 ; GCN-DAG: v_cmp_lg_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 [[CMP1]], vcc, [[CMP1]]
-; GCN: s_cselect_b32 [[SRESULT:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GCN-NOT: [[SRESULT]]
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {

diff  --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index 9e6d1a7cf76b..9b297ee1d7ee 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -66,8 +66,11 @@ define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8
 }
 
 ; GCN-LABEL: {{^}}select_v4i8:
-; GCN: s_cselect_b32
-; GCN-NOT: s_cselect_b32
+; GFX89: s_cselect_b32
+; GFX89-NOT: s_cselect_b32
+
+; SI: v_cndmask_b32
+; SI-NOT: cndmask
 define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
   %cmp = icmp eq i8 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
@@ -82,8 +85,8 @@ define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a,
 ; GFX89: s_cselect_b32
 ; GFX89-NOT: s_cselect_b32
 
-; SI: s_cselect_b32
-; SI-NOT: s_cselect_b32
+; SI: v_cndmask_b32_e32
+; SI-NOT: v_cndmask_b32_e32
 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b

diff  --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index 9c492dcfb351..61ce9c12526d 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck -check-prefixes=SI,GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI,GCN %s
 
-; CHECK-LABEL: {{^}}select0:
+; GCN-LABEL: {{^}}select0:
 ; i64 select should be split into two i32 selects, and we shouldn't need
 ; to use a shift to extract the hi dword of the input.
-; CHECK-NOT: s_lshr_b64
-; CHECK: v_cndmask
-; CHECK: v_cndmask
+; GCN-NOT: s_lshr_b64
+; GCN: v_cndmask
+; GCN: v_cndmask
 define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
 entry:
   %0 = icmp ugt i32 %cond, 5
@@ -15,9 +15,11 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: {{^}}select_trunc_i64:
-; CHECK: s_cselect_b32
-; CHECK-NOT: s_cselect_b32
+; GCN-LABEL: {{^}}select_trunc_i64:
+; VI: s_cselect_b32
+; VI-NOT: s_cselect_b32
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 0, i64 %in
@@ -26,9 +28,11 @@ define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i
   ret void
 }
 
-; CHECK-LABEL: {{^}}select_trunc_i64_2:
-; CHECK: s_cselect_b32
-; CHECK-NOT: s_cselect_b32
+; GCN-LABEL: {{^}}select_trunc_i64_2:
+; VI: s_cselect_b32
+; VI-NOT: s_cselect_b32
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 %a, i64 %b
@@ -37,9 +41,11 @@ define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond,
   ret void
 }
 
-; CHECK-LABEL: {{^}}v_select_trunc_i64_2:
-; CHECK: s_cselect_b32
-; CHECK-NOT: s_cselect_b32
+; GCN-LABEL: {{^}}v_select_trunc_i64_2:
+; VI: s_cselect_b32
+; VI-NOT: s_cselect_b32
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
@@ -50,10 +56,10 @@ define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %con
   ret void
 }
 
-; CHECK-LABEL: {{^}}v_select_i64_split_imm:
-; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
-; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
-; CHECK: s_endpgm
+; GCN-LABEL: {{^}}v_select_i64_split_imm:
+; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+; GCN-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
+; GCN: s_endpgm
 define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8

diff  --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 833a94174828..9ae6a3096e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -15,12 +15,18 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32
 ; uses an SGPR (implicit vcc).
 
 ; GCN-LABEL: {{^}}sint_to_fp_i1_f64:
-; GCN-DAG: s_cmp_eq_u32
-; GCN-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
-; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
-; GCN: s_endpgm
+; VI-DAG: s_cmp_eq_u32
+; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0xbff00000, 0
+; VI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
+; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
+; VI: s_endpgm
+
+; SI-DAG: v_cmp_eq_u32_e64 vcc,
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; SI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
+; SI: s_endpgm
 define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = sitofp i1 %cmp to double

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 746afd333b36..1f961fcf1a3b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
@@ -17,100 +18,99 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
+; GCN-NEXT:    v_mul_lo_u32 v7, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v9, vcc
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v6, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, s11, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v5, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s10, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s12, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v5, v1, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
@@ -119,7 +119,7 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -127,91 +127,97 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-LABEL: s_test_srem:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
+; GCN-IR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[0:1], 0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s0
-; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s1
-; GCN-IR-NEXT:    s_cmp_eq_u32 s1, 0
-; GCN-IR-NEXT:    s_cselect_b32 s10, s14, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s14, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[12:13], -1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s8, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_addc_u32 s17, s9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[6:7], s8
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[6:7], s16
-; GCN-IR-NEXT:    s_add_u32 s8, s0, -1
-; GCN-IR-NEXT:    s_addc_u32 s9, s1, -1
-; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s2, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s3, s11
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s3, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    s_add_u32 s8, s2, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s9, s3, -1
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s13, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s2, s8, s16
-; GCN-IR-NEXT:    s_subb_u32 s2, s9, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s2, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s2, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s18
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s19
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s8, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s2, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s3, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
-; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch BB0_6
-; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:  BB0_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
-; GCN-IR-NEXT:  BB0_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s0, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s1, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB0_7
+; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB0_6: ; %Flow6
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB0_7: ; %udiv-end
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v1
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s2, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
@@ -484,17 +490,17 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -516,17 +522,17 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-IR-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -555,17 +561,17 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -587,17 +593,17 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-IR-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -680,17 +686,17 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -712,17 +718,17 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-IR-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -751,17 +757,17 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -783,17 +789,17 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-IR-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s1, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -811,28 +817,27 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-LABEL: s_test_srem32_64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dword s6, s[0:1], 0xe
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s7
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
-; GCN-NEXT:    s_xor_b32 s0, s7, s6
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s8, s0, 1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s7
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GCN-NEXT:    s_xor_b32 s1, s7, s0
+; GCN-NEXT:    s_ashr_i32 s1, s1, 30
+; GCN-NEXT:    s_or_b32 s1, s1, 1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s8, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s7, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -841,28 +846,27 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-LABEL: s_test_srem32_64:
 ; GCN-IR:       ; %bb.0:
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s7
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s6
-; GCN-IR-NEXT:    s_xor_b32 s0, s7, s6
-; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-IR-NEXT:    s_or_b32 s8, s0, 1
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s7
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GCN-IR-NEXT:    s_xor_b32 s1, s7, s0
+; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
+; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-IR-NEXT:    s_cselect_b32 s0, s8, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s7, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -878,124 +882,124 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64
 define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem33_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
-; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 31
+; GCN-NEXT:    s_ashr_i64 s[2:3], s[10:11], 31
+; GCN-NEXT:    s_ashr_i64 s[4:5], s[0:1], 31
 ; GCN-NEXT:    s_ashr_i32 s0, s1, 31
-; GCN-NEXT:    s_add_u32 s8, s8, s0
+; GCN-NEXT:    s_add_u32 s4, s4, s0
 ; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_addc_u32 s9, s9, s0
-; GCN-NEXT:    s_xor_b64 s[12:13], s[8:9], s[0:1]
+; GCN-NEXT:    s_addc_u32 s5, s5, s0
+; GCN-NEXT:    s_xor_b64 s[12:13], s[4:5], s[0:1]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT:    s_sub_u32 s6, 0, s12
-; GCN-NEXT:    s_subb_u32 s8, 0, s13
-; GCN-NEXT:    s_mov_b32 s9, s5
+; GCN-NEXT:    s_sub_u32 s4, 0, s12
+; GCN-NEXT:    s_subb_u32 s5, 0, s13
+; GCN-NEXT:    s_ashr_i32 s10, s11, 31
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 s11, s10
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s8, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s6, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s8, v0
-; GCN-NEXT:    s_mov_b32 s8, s4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
+; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
+; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v6, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v11, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v6
+; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GCN-NEXT:    s_ashr_i32 s6, s7, 31
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GCN-NEXT:    s_add_u32 s0, s2, s6
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    s_mov_b32 s7, s6
-; GCN-NEXT:    s_addc_u32 s1, s3, s6
-; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[6:7]
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s14, v1
-; GCN-NEXT:    v_mul_hi_u32 v7, s15, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s15, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v7, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
+; GCN-NEXT:    s_add_u32 s0, s2, s10
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    s_addc_u32 s1, s3, s10
+; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[10:11]
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, s14, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s14, v0
+; GCN-NEXT:    v_mul_hi_u32 v5, s14, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, s15, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, s15, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, s15, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s15, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
+; GCN-NEXT:    s_mov_b32 s4, s8
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s14, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s15, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s12, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s14, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v5, s15
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v5, v1, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
@@ -1004,14 +1008,14 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s6, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, s6, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s10, v0
+; GCN-NEXT:    v_xor_b32_e32 v1, s10, v1
+; GCN-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem33_64:
@@ -1019,112 +1023,118 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[6:7], 31
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 31
-; GCN-IR-NEXT:    s_ashr_i32 s10, s1, 31
-; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
+; GCN-IR-NEXT:    s_ashr_i32 s2, s7, 31
+; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[0:1], 31
+; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 31
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 31
+; GCN-IR-NEXT:    s_mov_b32 s3, s2
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_mov_b32 s11, s10
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s10
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[2:3]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[10:11], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s6, s6, s2
+; GCN-IR-NEXT:    s_subb_u32 s7, s7, s2
+; GCN-IR-NEXT:    s_sub_u32 s8, s8, s0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s8
+; GCN-IR-NEXT:    s_subb_u32 s9, s9, s0
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[12:13], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[8:9], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s12, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s16, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[14:15], -1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz BB8_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s19, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[10:11], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[2:3], s18
-; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s8, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s8, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s9, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_3
-; GCN-IR-NEXT:  BB8_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    s_branch BB8_6
+; GCN-IR-NEXT:  BB8_4:
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB8_7
 ; GCN-IR-NEXT:  BB8_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
-; GCN-IR-NEXT:  BB8_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s6, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB8_6: ; %Flow6
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB8_7: ; %udiv-end
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s9, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -1164,7 +1174,7 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GCN-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
@@ -1182,113 +1192,119 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[2:3], 24
 ; GCN-IR-NEXT:    s_sext_i32_i16 s7, s0
-; GCN-IR-NEXT:    s_ashr_i32 s0, s3, 31
-; GCN-IR-NEXT:    s_ashr_i32 s12, s7, 31
-; GCN-IR-NEXT:    s_mov_b32 s1, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[6:7], 24
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[8:9], s[0:1]
-; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
-; GCN-IR-NEXT:    s_sub_u32 s6, s6, s12
-; GCN-IR-NEXT:    s_subb_u32 s7, s7, s12
+; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[2:3], 24
+; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
+; GCN-IR-NEXT:    s_mov_b32 s3, s2
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 24
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s6, s0, s2
+; GCN-IR-NEXT:    s_subb_u32 s7, s1, s2
+; GCN-IR-NEXT:    s_sub_u32 s8, s8, s10
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s8
+; GCN-IR-NEXT:    s_subb_u32 s9, s9, s10
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[12:13], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[8:9], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s12, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s16, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[14:15], -1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
-; GCN-IR-NEXT:    s_cbranch_vccz BB9_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s19, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[10:11], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[2:3], s18
-; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s8, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s8, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s9, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_3
-; GCN-IR-NEXT:  BB9_4: ; %Flow3
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    s_branch BB9_6
+; GCN-IR-NEXT:  BB9_4:
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB9_7
 ; GCN-IR-NEXT:  BB9_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
-; GCN-IR-NEXT:  BB9_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s6, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB9_6: ; %Flow3
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB9_7: ; %udiv-end
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s9, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1388,25 +1404,25 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], 24, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s8, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, 0, v1, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
@@ -1414,98 +1430,104 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_srem_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s4, s3, 31
-; GCN-IR-NEXT:    s_mov_b32 s5, s4
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GCN-IR-NEXT:    s_sub_u32 s2, s2, s4
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s4
-; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
-; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s8, s4, s5
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_cbranch_vccz BB10_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s6, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s6
+; GCN-IR-NEXT:    s_ashr_i32 s0, s7, 31
+; GCN-IR-NEXT:    s_mov_b32 s1, s0
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[6:7], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
+; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[6:7], 0, -1, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[3:4], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], 24, v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB10_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s12
 ; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], 24, v3
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 58, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[0:1], 0, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s4, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s4, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s6, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s2, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s3, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_3
-; GCN-IR-NEXT:  BB10_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    s_branch BB10_6
-; GCN-IR-NEXT:  BB10_5:
+; GCN-IR-NEXT:  BB10_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
-; GCN-IR-NEXT:  BB10_6: ; %udiv-end
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB10_7
+; GCN-IR-NEXT:  BB10_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB10_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB10_7: ; %udiv-end
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = srem i64 24, %x
   store i64 %result, i64 addrspace(1)* %out
@@ -2054,17 +2076,17 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GCN-NEXT:    s_ashr_i32 s5, s4, 30
 ; GCN-NEXT:    s_or_b32 s5, s5, 1
-; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mad_f32 v2, -v1, v0, s6
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
-; GCN-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s5, s5, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s5, v1
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
@@ -2081,17 +2103,17 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GCN-IR-NEXT:    s_ashr_i32 s5, s4, 30
 ; GCN-IR-NEXT:    s_or_b32 s5, s5, 1
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_mad_f32 v2, -v1, v0, s6
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
-; GCN-IR-NEXT:    s_cmp_lg_u32 s6, 0
-; GCN-IR-NEXT:    s_cselect_b32 s5, s5, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s5, v1
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
@@ -2115,16 +2137,16 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
 ; GCN-NEXT:    s_ashr_i32 s0, s6, 30
-; GCN-NEXT:    s_or_b32 s7, s0, 1
-; GCN-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
-; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mad_f32 v0, -v1, s1, v0
-; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, s1
-; GCN-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s7, 0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
+; GCN-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-NEXT:    v_mad_f32 v0, -v2, s1, v0
+; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s1
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GCN-NEXT:    s_movk_i32 s0, 0x5b7f
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
@@ -2144,16 +2166,16 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s6
 ; GCN-IR-NEXT:    s_ashr_i32 s0, s6, 30
-; GCN-IR-NEXT:    s_or_b32 s7, s0, 1
-; GCN-IR-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
-; GCN-IR-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_mad_f32 v0, -v1, s1, v0
-; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, s1
-; GCN-IR-NEXT:    s_cmp_lg_u32 s0, 0
-; GCN-IR-NEXT:    s_cselect_b32 s0, s7, 0
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
+; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mad_f32 v0, -v2, s1, v0
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s1
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GCN-IR-NEXT:    s_movk_i32 s0, 0x5b7f
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s1, s5

diff  --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index 3b6cda0c860e..29604b74db3a 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -98,8 +98,9 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a)
 ; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
 ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
-; GCN: s_cmp_lg_u32 s[[VLO]], 0
-; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
+; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
+; VI: s_cmp_lg_u64 s{{\[}}[[VLO]]:[[VHI]]], 0
+; VI: s_cselect_b32 {{s[0-9]+}}, 63, -12
 define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12

diff  --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index f18d35f1fef0..779c37b65a28 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -5,121 +5,121 @@
 define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT:    s_sub_u32 s2, 0, s12
-; GCN-NEXT:    s_subb_u32 s3, 0, s13
-; GCN-NEXT:    s_mov_b32 s4, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
+; GCN-NEXT:    v_mul_lo_u32 v7, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v9, vcc
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v6, s4, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v6, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, s11, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s12, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s13, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-NEXT:    s_mov_b32 s4, s8
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v5, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s12, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s10, v3
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s12, v3
-; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v6, s11
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v6, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -128,87 +128,93 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-LABEL: s_test_udiv_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
+; GCN-IR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[0:1], 0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s1
-; GCN-IR-NEXT:    s_cmp_eq_u32 s1, 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    s_cselect_b32 s8, s12, s8
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s12, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s8, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[10:11], 63
-; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[6:7], s14
-; GCN-IR-NEXT:    s_add_u32 s6, s0, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s1, -1
-; GCN-IR-NEXT:    s_not_b64 s[2:3], s[8:9]
-; GCN-IR-NEXT:    s_mov_b32 s13, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s2, s12
-; GCN-IR-NEXT:    s_addc_u32 s9, s3, s9
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s3, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
+; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s2, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s2, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s2, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s2, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s6, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s2, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s3, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
-; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    s_branch BB0_6
-; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:  BB0_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
-; GCN-IR-NEXT:  BB0_6: ; %udiv-end
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB0_7
+; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB0_6: ; %Flow6
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB0_7: ; %udiv-end
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -795,24 +801,24 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
 ; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, v0, v5
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 2, v1
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, v0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v8, v6, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v9, v7, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v5, v8, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -832,87 +838,93 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_and_b32 s0, s2, s9
 ; GCN-IR-NEXT:    s_and_b32 s3, s7, s8
 ; GCN-IR-NEXT:    s_and_b32 s2, s6, s9
-; GCN-IR-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
 ; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    s_mov_b64 s[0:1], 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s8, s8, s10
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s12, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s8, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[10:11], 63
-; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz BB7_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB7_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[6:7], s14
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
 ; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
-; GCN-IR-NEXT:    s_not_b64 s[0:1], s[8:9]
-; GCN-IR-NEXT:    s_mov_b32 s13, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s0, s12
-; GCN-IR-NEXT:    s_addc_u32 s9, s1, s9
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s1, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB7_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s0, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s0, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s0, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s0, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s0, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s6, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s2, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s3, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_3
-; GCN-IR-NEXT:  BB7_4: ; %Flow3
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    s_branch BB7_6
-; GCN-IR-NEXT:  BB7_5:
+; GCN-IR-NEXT:  BB7_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
-; GCN-IR-NEXT:  BB7_6: ; %udiv-end
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB7_7
+; GCN-IR-NEXT:  BB7_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB7_6: ; %Flow3
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB7_7: ; %udiv-end
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
@@ -1008,31 +1020,31 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v3, s6, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], 24, v3
-; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s6, v3
-; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 24, v3
+; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s6, v3
+; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
+; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
+; GCN-NEXT:    v_subb_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -1040,78 +1052,84 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ;
 ; GCN-IR-LABEL: s_test_udiv_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
-; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s8, s4, s5
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_cbranch_vccz BB8_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s6, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[2:3], 0, -1, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[3:4], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], 24, v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s12
-; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
+; GCN-IR-NEXT:    s_add_u32 s2, s6, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], 24, v3
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 58, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s3, s7, -1
+; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[0:1], 0, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s4, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s4, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s2, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s6, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s7, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_3
-; GCN-IR-NEXT:  BB8_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch BB8_6
-; GCN-IR-NEXT:  BB8_5:
+; GCN-IR-NEXT:  BB8_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
-; GCN-IR-NEXT:  BB8_6: ; %udiv-end
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB8_7
+; GCN-IR-NEXT:  BB8_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB8_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB8_7: ; %udiv-end
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 24, %x
   store i64 %result, i64 addrspace(1)* %out
@@ -1486,8 +1504,8 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, 24
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, 24
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v3, s11
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
+; GCN-NEXT:    v_mov_b32_e32 v3, s11
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v4
 ; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
@@ -1498,93 +1516,98 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], 23, v4
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_udiv_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
-; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s8, s4, s5
-; GCN-IR-NEXT:    s_sub_u32 s6, 59, s8
-; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_cbranch_vccz BB11_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s10, s6, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 59, v2
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[2:3], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB11_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[3:4], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB11_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s10
-; GCN-IR-NEXT:    s_add_u32 s2, s8, 0xffffffc4
-; GCN-IR-NEXT:    s_addc_u32 s3, 0, -1
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v3
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffc4, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, -1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s7, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s4, 23, s10
-; GCN-IR-NEXT:    s_subb_u32 s4, 0, s11
-; GCN-IR-NEXT:    s_ashr_i32 s8, s4, 31
-; GCN-IR-NEXT:    s_and_b32 s4, s8, 1
-; GCN-IR-NEXT:    s_and_b32 s8, s8, 24
-; GCN-IR-NEXT:    s_sub_u32 s10, s10, s8
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    s_subb_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-IR-NEXT:    s_add_u32 s2, s2, 1
-; GCN-IR-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, 23, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 24, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB11_3
-; GCN-IR-NEXT:  BB11_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch BB11_6
+; GCN-IR-NEXT:  BB11_4:
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB11_7
 ; GCN-IR-NEXT:  BB11_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[10:11]
-; GCN-IR-NEXT:  BB11_6: ; %udiv-end
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB11_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB11_7: ; %udiv-end
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, 24
   store i64 %result, i64 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index fe613f2579e4..bcd879e8f3d9 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -36,40 +36,40 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
 ;
 ; GFX6-LABEL: test_udivrem:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:        s_load_dword s3, s[0:1], 0x26
-; GFX6-NEXT:        s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:        s_load_dwordx2 s[8:9], s[0:1], 0x13
-; GFX6-NEXT:        s_load_dword s0, s[0:1], 0x1d
-; GFX6-NEXT:        s_mov_b32 s7, 0xf000
-; GFX6-NEXT:        s_mov_b32 s6, -1
-; GFX6-NEXT:        s_mov_b32 s10, s6
-; GFX6-NEXT:        s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:        v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:        s_sub_i32 s2, 0, s3
-; GFX6-NEXT:        s_mov_b32 s11, s7
-; GFX6-NEXT:        v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:        v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX6-NEXT:        v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:        v_mul_lo_u32 v1, s2, v0
-; GFX6-NEXT:        v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:        v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:        v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT:        v_mul_lo_u32 v1, v0, s3
-; GFX6-NEXT:        v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:        v_sub_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT:        v_cmp_le_u32_e64 s[0:1], s3, v1
-; GFX6-NEXT:        v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:        v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT:        v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; GFX6-NEXT:        v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:        v_cmp_le_u32_e64 s[0:1], s3, v1
-; GFX6-NEXT:        v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:        v_subrev_i32_e32 v2, vcc, s3, v1
-; GFX6-NEXT:        buffer_store_dword v0, off, s[4:7], 0
-; GFX6-NEXT:        s_waitcnt expcnt(0)
-; GFX6-NEXT:        v_cndmask_b32_e64 v0, v1, v2, s[0:1]
-; GFX6-NEXT:        buffer_store_dword v0, off, s[8:11], 0
-; GFX6-NEXT:        s_endpgm
+; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x26
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
+; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x1d
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s10, s6
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_sub_i32 s2, 0, s3
+; GFX6-NEXT:    s_mov_b32 s11, s7
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, v2, s[0:1]
+; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX6-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_udivrem:
 ; GFX8:       ; %bb.0:
@@ -156,39 +156,39 @@ define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i3
 ; GFX6-LABEL: test_udivrem_v2:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s3, 0x4f7ffffe
+; GFX6-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s3, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s3, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s7
+; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s7, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -319,72 +319,72 @@ define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i3
 ; GFX6-LABEL: test_udivrem_v4:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s12, 0x4f7ffffe
+; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX6-NEXT:    s_sub_i32 s2, 0, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s11
+; GFX6-NEXT:    s_sub_i32 s2, 0, s8
+; GFX6-NEXT:    s_sub_i32 s12, 0, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s3, 0, s9
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
-; GFX6-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
+; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s12, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s10
-; GFX6-NEXT:    v_mul_f32_e32 v2, s12, v2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v4
-; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v1
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
+; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX6-NEXT:    v_mul_f32_e32 v3, s12, v3
-; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v0
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GFX6-NEXT:    s_sub_i32 s2, 0, s11
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX6-NEXT:    s_sub_i32 s4, 0, s10
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
+; GFX6-NEXT:    s_sub_i32 s4, 0, s11
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v3
-; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3

diff  --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 97a6d0436ee8..de0c17912a6b 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -75,9 +75,11 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)
 ; uses an SGPR (implicit vcc).
 
 ; GCN-LABEL: {{^}}uint_to_fp_i1_to_f64:
-; GCN-DAG: s_cmp_eq_u32
-; GCN-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
-; GCN-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
+; VI-DAG: s_cmp_eq_u32
+; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
+; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
+; SI-DAG: v_cmp_eq_u32_e64 vcc
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; GCN: s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index cec4df0cd295..53af9618271d 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -6,6 +6,7 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-LABEL: s_test_urem_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
@@ -17,100 +18,99 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s5, s9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v6, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v6, 0
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
+; GCN-NEXT:    v_mul_lo_u32 v7, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v9, vcc
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
 ; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v6, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, s11, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v5, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s10, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s12, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v5, v1, s[0:1]
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
@@ -119,7 +119,7 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -127,91 +127,97 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-LABEL: s_test_urem_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
+; GCN-IR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[0:1], 0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s0
-; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s1
-; GCN-IR-NEXT:    s_cmp_eq_u32 s1, 0
-; GCN-IR-NEXT:    s_cselect_b32 s10, s14, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
-; GCN-IR-NEXT:    s_cselect_b32 s14, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[12:13], -1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
-; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s8, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_addc_u32 s17, s9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[6:7], s8
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[6:7], s16
-; GCN-IR-NEXT:    s_add_u32 s8, s0, -1
-; GCN-IR-NEXT:    s_addc_u32 s9, s1, -1
-; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s2, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s3, s11
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
-; GCN-IR-NEXT:    s_mov_b32 s3, 0
+; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
+; GCN-IR-NEXT:    s_add_u32 s8, s2, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s9, s3, -1
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s13, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s2, s8, s16
-; GCN-IR-NEXT:    s_subb_u32 s2, s9, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s2, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s2, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s18
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s19
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s8, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s2, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s3, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
-; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch BB0_6
-; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:  BB0_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
-; GCN-IR-NEXT:  BB0_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s0, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s0, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s1, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB0_7
+; GCN-IR-NEXT:  BB0_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB0_6: ; %Flow6
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB0_7: ; %udiv-end
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v1
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s2, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
@@ -820,25 +826,25 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], 24, v0
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s6, v0
-; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v4
-; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_subb_u32_e64 v1, vcc, 0, v1, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
+; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s6, v0
+; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s7, v5
+; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s6, v4
+; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s6, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s7, v5
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
@@ -846,93 +852,99 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
-; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s8, s4, s5
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
-; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_cbranch_vccz BB6_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s6, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
+; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[2:3], 0, -1, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB6_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[3:4], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], 24, v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB6_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s12
-; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
+; GCN-IR-NEXT:    s_add_u32 s2, s6, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], 24, v3
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 58, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    s_addc_u32 s3, s7, -1
+; GCN-IR-NEXT:    v_subb_u32_e64 v5, s[0:1], 0, 0, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB6_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s4, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s4, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s2, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s6, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s7, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subb_u32_e64 v7, s[0:1], v7, v11, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB6_3
-; GCN-IR-NEXT:  BB6_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    s_branch BB6_6
-; GCN-IR-NEXT:  BB6_5:
+; GCN-IR-NEXT:  BB6_4:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[10:11]
-; GCN-IR-NEXT:  BB6_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s2, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, 24, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB6_7
+; GCN-IR-NEXT:  BB6_5:
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB6_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB6_7: ; %udiv-end
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s6, v1
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 24, %x
   store i64 %result, i64 addrspace(1)* %out
@@ -954,12 +966,12 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, s2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s10, -1
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -972,7 +984,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s8
+; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
@@ -984,7 +996,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, s2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, s2
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v9, v0, v6
@@ -1004,15 +1016,15 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v4, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v5, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
@@ -1022,8 +1034,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
@@ -1033,99 +1045,104 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], 23, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
-; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
-; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
-; GCN-IR-NEXT:    s_cselect_b32 s6, s4, s5
-; GCN-IR-NEXT:    s_sub_u32 s8, 59, s6
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
-; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[8:9], 63
-; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
-; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
-; GCN-IR-NEXT:    s_cbranch_vccz BB7_5
-; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s7, 63, s8
-; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s7
+; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 59, v2
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[2:3], 0, 0, vcc
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GCN-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_4
+; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[3:4], v[0:1]
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-IR-NEXT:    s_cbranch_vccz BB7_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s10
-; GCN-IR-NEXT:    s_add_u32 s6, s6, 0xffffffc4
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
-; GCN-IR-NEXT:    s_mov_b32 s5, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v3
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffc4, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, -1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:  BB7_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s4, 23, s12
-; GCN-IR-NEXT:    s_subb_u32 s4, 0, s13
-; GCN-IR-NEXT:    s_ashr_i32 s10, s4, 31
-; GCN-IR-NEXT:    s_and_b32 s4, s10, 1
-; GCN-IR-NEXT:    s_and_b32 s10, s10, 24
-; GCN-IR-NEXT:    s_sub_u32 s12, s12, s10
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_subb_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
-; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GCN-IR-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v2, 31, v1
+; GCN-IR-NEXT:    v_or_b32_e32 v6, v6, v2
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, 23, v6
+; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v10, 24, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v8
+; GCN-IR-NEXT:    v_sub_i32_e64 v6, s[0:1], v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-IR-NEXT:    v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v2
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_3
-; GCN-IR-NEXT:  BB7_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], 1
-; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    s_branch BB7_6
+; GCN-IR-NEXT:  BB7_4:
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
+; GCN-IR-NEXT:    s_branch BB7_7
 ; GCN-IR-NEXT:  BB7_5:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[10:11]
-; GCN-IR-NEXT:  BB7_6: ; %udiv-end
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-IR-NEXT:  BB7_6: ; %Flow5
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
+; GCN-IR-NEXT:  BB7_7: ; %udiv-end
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
   %result = urem i64 %x, 24
   store i64 %result, i64 addrspace(1)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
index 41f03ee0e37c..cfc31a6445ea 100644
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=FUNC %s
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_select_v2i32:
@@ -7,10 +7,15 @@
 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
 
-; SI: s_cmp_gt_i32
-; SI: s_cselect_b32
-; SI: s_cmp_gt_i32
-; SI: s_cselect_b32
+; VI: s_cmp_gt_i32
+; VI: s_cselect_b32
+; VI: s_cmp_gt_i32
+; VI: s_cselect_b32
+
+; SI: v_cmp_gt_i32_e32 vcc
+; SI: v_cndmask_b32_e32
+; SI: v_cmp_gt_i32_e32 vcc
+; SI: v_cndmask_b32_e32
 
 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
 entry:
@@ -27,7 +32,6 @@ entry:
 ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-
 ; SI: v_cmp_neq_f32_e32 vcc
 ; SI: v_cndmask_b32_e32
 ; SI: v_cmp_neq_f32_e32 vcc
@@ -50,10 +54,15 @@ entry:
 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Z
 ; EG-DAG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW]}}, KC0[3].Y
 
-; SI: s_cselect_b32
-; SI: s_cselect_b32
-; SI: s_cselect_b32
-; SI: s_cselect_b32
+; VI: s_cselect_b32
+; VI: s_cselect_b32
+; VI: s_cselect_b32
+; VI: s_cselect_b32
+
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e32
 
 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
 entry:


        


More information about the llvm-commits mailing list