[llvm] d719f1c - AMDGPU: Add alloc priority to global ranges

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 10 10:12:51 PDT 2021


Author: Matt Arsenault
Date: 2021-08-10T13:12:34-04:00
New Revision: d719f1c3cc9c6f44438b4bd847816d7462945269

URL: https://github.com/llvm/llvm-project/commit/d719f1c3cc9c6f44438b4bd847816d7462945269
DIFF: https://github.com/llvm/llvm-project/commit/d719f1c3cc9c6f44438b4bd847816d7462945269.diff

LOG: AMDGPU: Add alloc priority to global ranges

The requested register class priorities weren't respected
globally. I'm not sure why this is a target option rather than the
expected default behavior (the hook was recently added in
1a6dc92be7d68611077f0fb0b723b361817c950c). This avoids an allocation
failure when many wide tuple spills are introduced. I think this is a
workaround, since the allocation priority should only be a performance
hint, not a correctness requirement. The allocator should be smarter
about when only a subregister needs to be spilled and restored.

This does regress a couple of degenerate store-stress lit tests, which
shouldn't be too important.
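
For context, here is a minimal standalone sketch of the mechanism the
hook enables. This is an illustrative model only, not the actual
RAGreedy code; enqueuePriority, RegClassInfo, and the bit layout are
hypothetical stand-ins. The idea is that with the hook returning true,
a global (cross-block) live range's enqueue priority also folds in its
register class's requested AllocationPriority, so wide tuple classes
are colored before smaller ranges can fragment the register file:

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for a register class's requested priority
// (in LLVM this comes from the register class definition).
struct RegClassInfo {
  unsigned AllocationPriority;
};

// Toy model of a greedy allocator's enqueue priority. Without the
// hook, only local (single-block) ranges honor AllocationPriority;
// with it, global ranges do too, so e.g. wide SGPR/VGPR tuples get
// assigned registers first.
uint64_t enqueuePriority(uint64_t rangeSize, bool isGlobalRange,
                         const RegClassInfo &RC,
                         bool addAllocPriorityToGlobalRanges) {
  uint64_t Prio = rangeSize;
  if (!isGlobalRange || addAllocPriorityToGlobalRanges)
    Prio |= static_cast<uint64_t>(RC.AllocationPriority) << 32;
  return Prio;
}

int main() {
  RegClassInfo WideTuple = {4}; // e.g. a 1024-bit tuple class
  std::printf("hook off: %#llx\n",
              (unsigned long long)enqueuePriority(100, true, WideTuple, false));
  std::printf("hook on:  %#llx\n",
              (unsigned long long)enqueuePriority(100, true, WideTuple, true));
  return 0;
}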

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
    llvm/test/CodeGen/AMDGPU/ctpop16.ll
    llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
    llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
    llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
    llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
    llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
    llvm/test/CodeGen/AMDGPU/idiv-licm.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
    llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
    llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 2a92051e5fb2e..a2510d8fff34f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -70,6 +70,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
                                        CallingConv::ID) const override;
   const uint32_t *getNoPreservedMask() const override;
 
+  bool addAllocPriorityToGlobalRanges() const override {
+    return true;
+  }
+
   // Stack access is very expensive. CSRs are also the high registers, and we
   // want to minimize the number of used registers.
   unsigned getCSRFirstUseCost() const override {

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 40b38c61aadf0..de3e08b9a2db2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -7,18 +7,16 @@
 define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 {
 ; GCN-LABEL: v_insert_v64i32_varidx:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[20:23], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x10
 ; GCN-NEXT:    s_add_u32 s0, s0, s7
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v16, 0x100
-; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0x0
-; GCN-NEXT:    s_load_dwordx16 s[52:67], s[10:11], 0x40
-; GCN-NEXT:    s_load_dwordx16 s[12:27], s[10:11], 0x80
-; GCN-NEXT:    s_and_b32 s4, s7, 63
-; GCN-NEXT:    s_lshl_b32 s4, s4, 2
+; GCN-NEXT:    s_load_dwordx16 s[36:51], s[22:23], 0x0
+; GCN-NEXT:    s_load_dwordx16 s[52:67], s[22:23], 0x40
+; GCN-NEXT:    s_load_dwordx16 s[4:19], s[22:23], 0x80
+; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NEXT:    v_mov_b32_e32 v1, s37
@@ -36,7 +34,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v13, s49
 ; GCN-NEXT:    v_mov_b32_e32 v14, s50
 ; GCN-NEXT:    v_mov_b32_e32 v15, s51
-; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0xc0
+; GCN-NEXT:    s_load_dwordx16 s[36:51], s[22:23], 0xc0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:256
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:260
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:264
@@ -85,37 +83,37 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:376
 ; GCN-NEXT:    v_mov_b32_e32 v0, s67
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:380
-; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:384
-; GCN-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:388
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:392
-; GCN-NEXT:    v_mov_b32_e32 v0, s15
+; GCN-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:396
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:400
-; GCN-NEXT:    v_mov_b32_e32 v0, s17
+; GCN-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:404
-; GCN-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:408
-; GCN-NEXT:    v_mov_b32_e32 v0, s19
+; GCN-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:412
-; GCN-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:416
-; GCN-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-NEXT:    v_mov_b32_e32 v0, s13
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:420
-; GCN-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:424
-; GCN-NEXT:    v_mov_b32_e32 v0, s23
+; GCN-NEXT:    v_mov_b32_e32 v0, s15
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:428
-; GCN-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:432
-; GCN-NEXT:    v_mov_b32_e32 v0, s25
+; GCN-NEXT:    v_mov_b32_e32 v0, s17
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:436
-; GCN-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:440
-; GCN-NEXT:    v_mov_b32_e32 v0, s27
+; GCN-NEXT:    v_mov_b32_e32 v0, s19
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:444
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s36
@@ -145,13 +143,15 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    v_mov_b32_e32 v0, s48
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:496
 ; GCN-NEXT:    v_mov_b32_e32 v0, s49
+; GCN-NEXT:    s_and_b32 s4, s25, 63
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:500
 ; GCN-NEXT:    v_mov_b32_e32 v0, s50
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:504
 ; GCN-NEXT:    v_mov_b32_e32 v0, s51
+; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:508
 ; GCN-NEXT:    v_add_u32_e32 v0, s4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_mov_b32_e32 v1, s24
 ; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:256
 ; GCN-NEXT:    s_nop 0
@@ -219,37 +219,37 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], 0 offset:504
 ; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], 0 offset:508
 ; GCN-NEXT:    s_waitcnt vmcnt(60)
-; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[8:9]
+; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[20:21]
 ; GCN-NEXT:    s_waitcnt vmcnt(57)
-; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[8:9] offset:16
+; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[20:21] offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(54)
-; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[8:9] offset:32
+; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[20:21] offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(51)
-; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[8:9] offset:48
+; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[20:21] offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(48)
-; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[8:9] offset:64
+; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[20:21] offset:64
 ; GCN-NEXT:    s_waitcnt vmcnt(45)
-; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[8:9] offset:80
+; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[20:21] offset:80
 ; GCN-NEXT:    s_waitcnt vmcnt(42)
-; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[8:9] offset:96
+; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[20:21] offset:96
 ; GCN-NEXT:    s_waitcnt vmcnt(39)
-; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[8:9] offset:112
+; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[20:21] offset:112
 ; GCN-NEXT:    s_waitcnt vmcnt(36)
-; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[8:9] offset:128
+; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[20:21] offset:128
 ; GCN-NEXT:    s_waitcnt vmcnt(33)
-; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[8:9] offset:144
+; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[20:21] offset:144
 ; GCN-NEXT:    s_waitcnt vmcnt(30)
-; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[8:9] offset:160
+; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[20:21] offset:160
 ; GCN-NEXT:    s_waitcnt vmcnt(27)
-; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[8:9] offset:176
+; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[20:21] offset:176
 ; GCN-NEXT:    s_waitcnt vmcnt(24)
-; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[8:9] offset:192
+; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[20:21] offset:192
 ; GCN-NEXT:    s_waitcnt vmcnt(21)
-; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[8:9] offset:208
+; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[20:21] offset:208
 ; GCN-NEXT:    s_waitcnt vmcnt(18)
-; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[8:9] offset:224
+; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[20:21] offset:224
 ; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[8:9] offset:240
+; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[20:21] offset:240
 ; GCN-NEXT:    s_endpgm
   %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
   %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index ffe8074eb5974..40b2131b21ae3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -946,9 +946,9 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    buffer_load_dwordx3 v[1:3], v[1:2], s[8:11], 0 addr64
-; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX7-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7-NEXT:    s_cbranch_execz BB13_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb
 ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x1d
@@ -956,10 +956,10 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX7-NEXT:  BB13_2: ; %exit
-; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT:    s_and_b32 s0, 1, s2
+; GFX7-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX7-NEXT:    s_and_b32 s0, 1, s6
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX7-NEXT:    s_mov_b32 s10, -1
 ; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
@@ -971,18 +971,18 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ;
 ; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x4c
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
-; GFX8-NEXT:    s_mov_b32 s2, 0
+; GFX8-NEXT:    s_mov_b32 s6, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    flat_load_dwordx3 v[1:3], v[1:2]
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB13_2
 ; GFX8-NEXT:  ; %bb.1: ; %bb
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x74
@@ -990,12 +990,12 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out
 ; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX8-NEXT:  BB13_2: ; %exit
-; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT:    s_add_u32 s0, s4, 8
-; GFX8-NEXT:    s_addc_u32 s1, s5, 0
-; GFX8-NEXT:    s_and_b32 s2, 1, s2
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_add_u32 s0, s2, 8
+; GFX8-NEXT:    s_addc_u32 s1, s3, 0
+; GFX8-NEXT:    s_and_b32 s2, 1, s6
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_nop 2

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 0c8010c55d530..23aa8146a8497 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -147,6 +147,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
 define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
 ; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v9, v1
+; GFX1030-NEXT:    v_mov_b32_e32 v13, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v18, v3
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1030-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v14
@@ -154,7 +158,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v16
 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v17
 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
-; GFX1030-NEXT:    image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7]
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], [v5, v9, v13, v18, v4, v6, v7, v8, v10, v11, v12], s[4:7]
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
@@ -163,10 +167,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
 ; GFX1030-NEXT:  ; %bb.2:
 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v18
-; GFX1030-NEXT:    v_mov_b32_e32 v1, v19
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v20
-; GFX1030-NEXT:    v_mov_b32_e32 v3, v21
 ; GFX1030-NEXT:    ; return to shader part epilog
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
@@ -207,23 +207,27 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GFX1030-NEXT:    v_and_b32_e32 v14, s0, v8
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX1030-NEXT:    v_and_b32_e32 v15, s0, v9
+; GFX1030-NEXT:    v_mov_b32_e32 v5, v0
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
+; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v8
+; GFX1030-NEXT:    v_mov_b32_e32 v15, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v16, v3
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v9
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX1030-NEXT:    v_lshl_or_b32 v15, v15, 16, v8
-; GFX1030-NEXT:    v_and_or_b32 v9, v6, s0, v5
-; GFX1030-NEXT:    v_and_or_b32 v14, v7, s0, v14
+; GFX1030-NEXT:    v_and_or_b32 v6, v6, s0, v0
+; GFX1030-NEXT:    v_and_or_b32 v7, v7, s0, v1
+; GFX1030-NEXT:    v_lshl_or_b32 v8, v3, 16, v2
 ; GFX1030-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v11
 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v12
 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v13
 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
-; GFX1030-NEXT:    image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16
+; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], [v5, v14, v15, v16, v4, v6, v7, v8], s[4:7] a16
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
@@ -232,10 +236,6 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030-NEXT:  ; %bb.2:
 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v5
-; GFX1030-NEXT:    v_mov_b32_e32 v1, v6
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v7
-; GFX1030-NEXT:    v_mov_b32_e32 v3, v8
 ; GFX1030-NEXT:    ; return to shader part epilog
 ;
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
@@ -279,6 +279,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
 ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
 ; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    v_mov_b32_e32 v6, v0
+; GFX1030-NEXT:    v_mov_b32_e32 v10, v1
+; GFX1030-NEXT:    v_mov_b32_e32 v14, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v19, v3
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1030-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v15
@@ -286,7 +290,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v17
 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v18
 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], [v6, v10, v14, v19, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
@@ -295,10 +299,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
 ; GFX1030-NEXT:  ; %bb.2:
 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v19
-; GFX1030-NEXT:    v_mov_b32_e32 v1, v20
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v21
-; GFX1030-NEXT:    v_mov_b32_e32 v3, v22
 ; GFX1030-NEXT:    ; return to shader part epilog
 ;
 ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
@@ -339,23 +339,27 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; GFX1030-NEXT:    v_and_b32_e32 v15, s0, v9
-; GFX1030-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX1030-NEXT:    v_and_b32_e32 v16, s0, v10
+; GFX1030-NEXT:    v_mov_b32_e32 v6, v0
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
+; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v9
+; GFX1030-NEXT:    v_mov_b32_e32 v16, v2
+; GFX1030-NEXT:    v_mov_b32_e32 v17, v3
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v10
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX1030-NEXT:    v_lshl_or_b32 v16, v16, 16, v9
-; GFX1030-NEXT:    v_and_or_b32 v10, v7, s0, v6
-; GFX1030-NEXT:    v_and_or_b32 v15, v8, s0, v15
+; GFX1030-NEXT:    v_and_or_b32 v7, v7, s0, v0
+; GFX1030-NEXT:    v_and_or_b32 v8, v8, s0, v1
+; GFX1030-NEXT:    v_lshl_or_b32 v9, v3, 16, v2
 ; GFX1030-NEXT:  BB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v11
 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v12
 ; GFX1030-NEXT:    v_readfirstlane_b32 s6, v13
 ; GFX1030-NEXT:    v_readfirstlane_b32 s7, v14
 ; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
-; GFX1030-NEXT:    image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16
+; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], [v6, v15, v16, v17, v4, v5, v7, v8, v9], s[4:7] a16
 ; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
 ; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
@@ -364,10 +368,6 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1030-NEXT:  ; %bb.2:
 ; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1030-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-NEXT:    v_mov_b32_e32 v0, v6
-; GFX1030-NEXT:    v_mov_b32_e32 v1, v7
-; GFX1030-NEXT:    v_mov_b32_e32 v2, v8
-; GFX1030-NEXT:    v_mov_b32_e32 v3, v9
 ; GFX1030-NEXT:    ; return to shader part epilog
 ;
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index 23cc4fb459d4d..9f7bce7a8e57c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -663,15 +663,15 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_wqm_b64 exec, exec
 ; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; SI-NEXT:    s_movk_i32 s2, 0x3c00
-; SI-NEXT:    s_bfe_u32 s4, 0, 0x100000
-; SI-NEXT:    s_bfe_u32 s3, s2, 0x100000
-; SI-NEXT:    s_lshl_b32 s2, s4, 16
-; SI-NEXT:    s_or_b32 s2, s3, s2
-; SI-NEXT:    s_lshl_b32 s3, s3, 16
-; SI-NEXT:    s_or_b32 s3, s4, s3
+; SI-NEXT:    s_bfe_u32 s3, 0, 0x100000
+; SI-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; SI-NEXT:    s_lshl_b32 s4, s3, 16
+; SI-NEXT:    s_or_b32 s4, s2, s4
+; SI-NEXT:    s_lshl_b32 s2, s2, 16
+; SI-NEXT:    s_or_b32 s5, s3, s2
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz BB6_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
@@ -680,9 +680,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
 ; SI-NEXT:  BB6_3: ; %.continue0
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -691,10 +691,10 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 ; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
-; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; SI-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
-; SI-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; SI-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; SI-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[6:7]
 ; SI-NEXT:    s_cbranch_execz BB6_6
 ; SI-NEXT:  ; %bb.4: ; %.demote1
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
@@ -702,9 +702,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:  ; %bb.5: ; %.demote1
 ; SI-NEXT:    s_mov_b64 exec, 0
 ; SI-NEXT:  BB6_6: ; %.continue1
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  BB6_7:
@@ -893,16 +893,16 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    s_wqm_b64 exec, exec
 ; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; SI-NEXT:    s_movk_i32 s2, 0x3c00
-; SI-NEXT:    s_bfe_u32 s4, 0, 0x100000
-; SI-NEXT:    s_bfe_u32 s3, s2, 0x100000
-; SI-NEXT:    s_lshl_b32 s2, s4, 16
-; SI-NEXT:    s_or_b32 s2, s3, s2
-; SI-NEXT:    s_lshl_b32 s3, s3, 16
-; SI-NEXT:    s_or_b32 s3, s4, s3
-; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_bfe_u32 s3, 0, 0x100000
+; SI-NEXT:    s_bfe_u32 s2, s2, 0x100000
+; SI-NEXT:    s_lshl_b32 s4, s3, 16
+; SI-NEXT:    s_or_b32 s6, s2, s4
+; SI-NEXT:    s_lshl_b32 s2, s2, 16
+; SI-NEXT:    s_or_b32 s7, s3, s2
+; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz BB7_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
@@ -911,22 +911,22 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
 ; SI-NEXT:  BB7_3: ; %.continue0.preheader
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_mov_b64 s[2:3], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    s_branch BB7_5
 ; SI-NEXT:  BB7_4: ; %.continue1
 ; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
-; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; SI-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
-; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz BB7_8
 ; SI-NEXT:  BB7_5: ; %.continue0
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[6:7]
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v3, v2
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -934,10 +934,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SI-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
-; SI-NEXT:    s_and_b64 s[6:7], s[0:1], vcc
-; SI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
-; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
-; SI-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; SI-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execz BB7_4
 ; SI-NEXT:  ; %bb.6: ; %.demote1
 ; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
@@ -949,10 +949,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_branch BB7_4
 ; SI-NEXT:  BB7_8: ; %.return
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
 ; SI-NEXT:    s_endpgm
 ; SI-NEXT:  BB7_9:
@@ -1094,10 +1094,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX10-64-NEXT:    s_wqm_b64 exec, exec
 ; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-64-NEXT:    s_mov_b32 s2, 0
+; GFX10-64-NEXT:    s_mov_b32 s4, 0
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX10-64-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
@@ -1106,8 +1106,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
 ; GFX10-64-NEXT:  BB7_3: ; %.continue0.preheader
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX10-64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX10-64-NEXT:    s_branch BB7_5
 ; GFX10-64-NEXT:  BB7_4: ; %.continue1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 890dbf79126b4..3d495e85556bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -8,45 +8,47 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_sdiv_i64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v5, v1, v3
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT:    v_mov_b32_e32 v5, v1
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB0_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v4
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v4
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v3
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CHECK-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v2
-; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v0
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v6
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
+; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v7
+; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v3
 ; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
-; CHECK-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v6
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v5
+; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v7
+; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v5
-; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v5
+; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v3
+; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v3
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v6, v12
-; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v10
-; CHECK-NEXT:    v_mul_hi_u32 v14, v5, v12
+; CHECK-NEXT:    v_mul_lo_u32 v13, v3, v10
+; CHECK-NEXT:    v_mul_hi_u32 v14, v3, v12
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v6, v12
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
@@ -54,7 +56,7 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v14, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CHECK-NEXT:    v_mul_hi_u32 v13, v5, v10
+; CHECK-NEXT:    v_mul_hi_u32 v13, v3, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v10
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -65,18 +67,18 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
 ; CHECK-NEXT:    v_addc_u32_e64 v11, s[4:5], v6, v10, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v9, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v9, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v11
-; CHECK-NEXT:    v_mul_lo_u32 v13, v8, v5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v5
+; CHECK-NEXT:    v_mul_lo_u32 v13, v8, v3
+; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v3
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v13
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v13
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v11, v13
-; CHECK-NEXT:    v_mul_lo_u32 v12, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v12, v3, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
@@ -84,7 +86,7 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
@@ -96,99 +98,97 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
 ; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v10, v4, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v5, v3
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v1, v6
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v10, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v6
-; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v5
-; CHECK-NEXT:    v_mul_lo_u32 v10, v2, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v3
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v3
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v5, v8, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v5, s[4:5], v5, v8
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v2
+; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v5, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v4, v1
+; CHECK-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v2
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, 0, v10, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v2, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v9
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v10, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v10, v4, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v2, v7, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v3, v7, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v1, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v2, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_cbranch_execz BB0_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v2
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v0, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v4
-; CHECK-NEXT:    v_mov_b32_e32 v1, v5
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, %den
   ret i64 %result
@@ -692,11 +692,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_sdiv_v2i64:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v9, v1
-; CGP-NEXT:    v_mov_b32_e32 v8, v0
-; CGP-NEXT:    v_or_b32_e32 v1, v9, v5
+; CGP-NEXT:    v_mov_b32_e32 v11, v1
+; CGP-NEXT:    v_mov_b32_e32 v10, v0
+; CGP-NEXT:    v_or_b32_e32 v1, v11, v5
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CGP-NEXT:    v_mov_b32_e32 v8, v2
+; CGP-NEXT:    v_mov_b32_e32 v9, v3
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
@@ -704,44 +706,44 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v0
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v5, v0, vcc
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v5, v0, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v10
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
+; CGP-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v5
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v11, v5, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
-; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v8, v8, v11
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v5
-; CGP-NEXT:    v_trunc_f32_e32 v10, v10
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v10
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_xor_b32_e32 v9, v9, v11
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v5
-; CGP-NEXT:    v_mul_lo_u32 v15, v12, v10
-; CGP-NEXT:    v_mul_hi_u32 v17, v12, v5
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v5
+; CGP-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; CGP-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v3
+; CGP-NEXT:    v_trunc_f32_e32 v11, v11
+; CGP-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v11
+; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v5
+; CGP-NEXT:    v_mul_lo_u32 v14, v13, v3
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v17, v12, v3
+; CGP-NEXT:    v_mul_lo_u32 v16, v12, v3
+; CGP-NEXT:    v_xor_b32_e32 v10, v10, v5
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v16
-; CGP-NEXT:    v_mul_lo_u32 v17, v5, v14
-; CGP-NEXT:    v_mul_hi_u32 v18, v5, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v10, v16
+; CGP-NEXT:    v_mul_lo_u32 v15, v11, v16
+; CGP-NEXT:    v_mul_lo_u32 v17, v3, v14
+; CGP-NEXT:    v_mul_hi_u32 v18, v3, v16
+; CGP-NEXT:    v_mul_hi_u32 v16, v11, v16
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v18, v10, v14
+; CGP-NEXT:    v_mul_lo_u32 v18, v11, v14
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT:    v_mul_hi_u32 v17, v3, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v11, v14
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
@@ -751,18 +753,18 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
-; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v10, v14, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v13, v5
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v15
+; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v11, v14, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v17, v12, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v12, v5
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
+; CGP-NEXT:    v_mul_lo_u32 v17, v12, v3
+; CGP-NEXT:    v_mul_hi_u32 v12, v12, v3
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v17
+; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v16, v5, v12
+; CGP-NEXT:    v_mul_lo_u32 v16, v3, v12
 ; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
@@ -770,7 +772,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v5, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v3, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
@@ -781,71 +783,71 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v5
-; CGP-NEXT:    v_mul_lo_u32 v13, v8, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v9, v5
+; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v11, v12, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
+; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, v3
+; CGP-NEXT:    v_mul_lo_u32 v13, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v10, v3
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v10
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v9, v10
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v14, v5
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v10, v11
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v10
-; CGP-NEXT:    v_mul_hi_u32 v15, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v5
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v3
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v1, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v3
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v14
-; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v9, v12, vcc
-; CGP-NEXT:    v_sub_i32_e64 v9, s[4:5], v9, v12
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v4
-; CGP-NEXT:    v_subb_u32_e32 v9, vcc, v9, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v10, v12, vcc
+; CGP-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v12
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v2
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, v10, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v1
-; CGP-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v1
+; CGP-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v4
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v5
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v2
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v10, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v4
+; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v4
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v15, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v13
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v14, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v13
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v14, v4, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v11, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v1, v5
-; CGP-NEXT:    v_xor_b32_e32 v1, v4, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v5, v0
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v1, v3
+; CGP-NEXT:    v_xor_b32_e32 v1, v2, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr4
-; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -859,68 +861,68 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v10, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB2_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v7
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v9, v7
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB2_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v6
-; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v5
-; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v6, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v9
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
-; CGP-NEXT:    v_trunc_f32_e32 v8, v8
-; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v7
-; CGP-NEXT:    v_mul_lo_u32 v13, v10, v8
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v7
-; CGP-NEXT:    v_mul_lo_u32 v14, v10, v7
+; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v2
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v7, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v7
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v7, vcc
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
+; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v5
+; CGP-NEXT:    v_trunc_f32_e32 v9, v9
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v9
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v11, v5
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v9
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v5
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v5
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v7
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v13, v8, v14
-; CGP-NEXT:    v_mul_lo_u32 v15, v7, v12
-; CGP-NEXT:    v_mul_hi_u32 v16, v7, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v14
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v14
+; CGP-NEXT:    v_mul_lo_u32 v15, v5, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v5, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v16, v8, v12
+; CGP-NEXT:    v_mul_lo_u32 v16, v9, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT:    v_mul_hi_u32 v15, v7, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v5, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
@@ -930,18 +932,18 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v11, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v9, v12, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v11, v5
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v7
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v7
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
+; CGP-NEXT:    v_mul_lo_u32 v15, v10, v5
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v5
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v15
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v15
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_lo_u32 v14, v5, v10
 ; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -949,7 +951,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v5, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
@@ -960,100 +962,98 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v7
-; CGP-NEXT:    v_mul_lo_u32 v11, v2, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v5
+; CGP-NEXT:    v_mul_lo_u32 v11, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v12, v6, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v11, v2, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v5
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v3, v5
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v5
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v3, v10, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v10
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v6
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v8, v10, vcc
+; CGP-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v4
+; CGP-NEXT:    v_subb_u32_e32 v8, vcc, v8, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v3
+; CGP-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v6
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v7
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v4
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v8, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
+; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v13, v2, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v11
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v5, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v11
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, 0, v12, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v11, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v6, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v9, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v5
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v5
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v5
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v7, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v3, v5
+; CGP-NEXT:    v_xor_b32_e32 v3, v4, v5
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB2_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v6
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v3, v6
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v6
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v8, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -2510,37 +2510,39 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_lshl_b64 v[4:5], s[4:5], v2
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_or_b32_e32 v3, v1, v5
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB7_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v2
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v2, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v2
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v4
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v0
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v6, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v0
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v4
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
 ; CHECK-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v3
-; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
+; CHECK-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
+; CHECK-NEXT:    v_subb_u32_e32 v9, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; CHECK-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
 ; CHECK-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v8, v5
@@ -2601,18 +2603,18 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v6
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
@@ -2622,76 +2624,74 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v5
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
-; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v8
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v10
+; CHECK-NEXT:    v_subb_u32_e64 v9, s[4:5], v4, v8, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v4, s[4:5], v4, v8
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v2
+; CHECK-NEXT:    v_subb_u32_e32 v4, vcc, v4, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v1
+; CHECK-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v4
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v2
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v9
 ; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, 0, v10, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v9, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v7, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v3
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v3, v7, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v1, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v2, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_cbranch_execz BB7_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v0, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v2
-; CHECK-NEXT:    v_mov_b32_e32 v1, v3
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = sdiv i64 %x, %shl.y
@@ -2995,58 +2995,60 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v4
-; CGP-NEXT:    v_mov_b32_e32 v7, v1
-; CGP-NEXT:    v_mov_b32_e32 v5, v0
-; CGP-NEXT:    v_or_b32_e32 v1, v7, v11
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_mov_b32_e32 v9, v1
+; CGP-NEXT:    v_mov_b32_e32 v8, v0
+; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT:    v_lshl_b64 v[8:9], s[4:5], v6
+; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_2
 ; CGP-NEXT:  ; %bb.1:
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v0
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v0, vcc
+; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v10, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v10
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v6
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v6, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v12, vcc, 0, v1
-; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v11
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v6
-; CGP-NEXT:    v_trunc_f32_e32 v10, v10
-; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v10
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v11
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v12, v10
-; CGP-NEXT:    v_mul_hi_u32 v17, v12, v6
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v6
+; CGP-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v3
+; CGP-NEXT:    v_trunc_f32_e32 v9, v9
+; CGP-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v9
+; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT:    v_subb_u32_e32 v13, vcc, 0, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v6
+; CGP-NEXT:    v_mul_lo_u32 v14, v13, v3
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v9
+; CGP-NEXT:    v_mul_hi_u32 v17, v12, v3
+; CGP-NEXT:    v_mul_lo_u32 v16, v12, v3
+; CGP-NEXT:    v_xor_b32_e32 v8, v8, v6
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v16
-; CGP-NEXT:    v_mul_lo_u32 v17, v6, v14
-; CGP-NEXT:    v_mul_hi_u32 v18, v6, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v10, v16
+; CGP-NEXT:    v_mul_lo_u32 v15, v9, v16
+; CGP-NEXT:    v_mul_lo_u32 v17, v3, v14
+; CGP-NEXT:    v_mul_hi_u32 v18, v3, v16
+; CGP-NEXT:    v_mul_hi_u32 v16, v9, v16
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v18, v10, v14
+; CGP-NEXT:    v_mul_lo_u32 v18, v9, v14
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_mul_hi_u32 v17, v6, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT:    v_mul_hi_u32 v17, v3, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v18, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
@@ -3056,18 +3058,18 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v10, v14, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v13, v6
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v15
+; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v9, v14, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v17, v12, v6
-; CGP-NEXT:    v_mul_hi_u32 v12, v12, v6
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
+; CGP-NEXT:    v_mul_lo_u32 v17, v12, v3
+; CGP-NEXT:    v_mul_hi_u32 v12, v12, v3
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v14, v6, v17
+; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v16, v6, v12
+; CGP-NEXT:    v_mul_lo_u32 v16, v3, v12
 ; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
@@ -3075,7 +3077,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v6, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v3, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
@@ -3086,137 +3088,137 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v7, v6
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v8, v3
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_lo_u32 v14, v8, v9
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v1, v10
-; CGP-NEXT:    v_mul_hi_u32 v15, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v6
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v3
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v9
+; CGP-NEXT:    v_mul_hi_u32 v15, v1, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v3
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v7, v12, vcc
-; CGP-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v12
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v4
-; CGP-NEXT:    v_subb_u32_e32 v7, vcc, v7, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT:    v_subb_u32_e64 v13, s[4:5], v8, v12, vcc
+; CGP-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v12
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v13, v2
+; CGP-NEXT:    v_subb_u32_e32 v8, vcc, v8, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v1
-; CGP-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v1
+; CGP-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v4
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v6
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v13, v2
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, 1, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[4:5]
-; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v10, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v4
+; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v9, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v15, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v13
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v14, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v13
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v13, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v14, v4, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v11, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v1, v5
-; CGP-NEXT:    v_xor_b32_e32 v1, v4, v5
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v6, v0
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v1, v3
+; CGP-NEXT:    v_xor_b32_e32 v1, v2, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB8_4
 ; CGP-NEXT:  ; %bb.3:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v5, v0
-; CGP-NEXT:    v_mul_lo_u32 v1, v0, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v1, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, v0, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB8_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v9
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v9
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v6
-; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v5
-; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v6, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v9
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
+; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v11
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v2
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v4, v4, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v4
+; CGP-NEXT:    v_ashrrev_i32_e32 v9, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v8
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v3
+; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v9
+; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
-; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v7
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v11, v6
 ; CGP-NEXT:    v_mul_lo_u32 v13, v10, v8
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v7
-; CGP-NEXT:    v_mul_lo_u32 v14, v10, v7
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v6
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_mul_lo_u32 v13, v8, v14
-; CGP-NEXT:    v_mul_lo_u32 v15, v7, v12
-; CGP-NEXT:    v_mul_hi_u32 v16, v7, v14
+; CGP-NEXT:    v_mul_lo_u32 v15, v6, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v6, v14
 ; CGP-NEXT:    v_mul_hi_u32 v14, v8, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
@@ -3224,7 +3226,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v16, v8, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT:    v_mul_hi_u32 v15, v7, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v6, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
@@ -3235,18 +3237,18 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
 ; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v11, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v11, v6
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v7
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v7
+; CGP-NEXT:    v_mul_lo_u32 v15, v10, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, v10, v6
 ; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v15
+; CGP-NEXT:    v_mul_hi_u32 v12, v6, v15
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
 ; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_lo_u32 v14, v6, v10
 ; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -3254,7 +3256,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v6, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
@@ -3266,99 +3268,97 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v7
-; CGP-NEXT:    v_mul_lo_u32 v11, v2, v8
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v7, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v6
+; CGP-NEXT:    v_mul_hi_u32 v6, v7, v6
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v11, v2, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v3, v6
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v6
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v3, v10, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v10
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v6
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v7, v10, vcc
+; CGP-NEXT:    v_sub_i32_e64 v7, s[4:5], v7, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v4
+; CGP-NEXT:    v_subb_u32_e32 v7, vcc, v7, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
+; CGP-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v6
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v7
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v4
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, v10, v12, s[4:5]
 ; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v8, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v13, v2, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v11
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v11
 ; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v12, v5, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v11, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v5, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v9, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v5
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v5
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v5
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v5, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v9, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v3, v5
+; CGP-NEXT:    v_xor_b32_e32 v3, v4, v5
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v8
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 0c56ac3816c02..8d911d2843ab1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -1341,31 +1341,31 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
 define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64> addrspace(1)* %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: sdivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x10
-; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
+; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_ashr_i32 s2, s13, 31
 ; GFX8-NEXT:    s_ashr_i32 s6, s9, 31
-; GFX8-NEXT:    s_add_u32 s0, s12, s2
-; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX8-NEXT:    s_and_b32 s1, s1, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s13, s2
+; GFX8-NEXT:    s_ashr_i32 s12, s1, 31
 ; GFX8-NEXT:    s_add_u32 s8, s8, s6
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX8-NEXT:    s_mov_b32 s7, s6
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_and_b32 s7, s7, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX8-NEXT:    s_addc_u32 s9, s9, s6
+; GFX8-NEXT:    s_add_u32 s0, s0, s12
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_and_b32 s7, s7, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX8-NEXT:    s_mov_b32 s13, s12
+; GFX8-NEXT:    s_addc_u32 s1, s1, s12
+; GFX8-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT:    s_mov_b32 s7, s6
 ; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX8-NEXT:    s_mov_b32 s3, s2
-; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s16, 0, s8
+; GFX8-NEXT:    s_sub_u32 s16, 0, s14
 ; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1376,12 +1376,12 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s17, 0, s9
+; GFX8-NEXT:    s_subb_u32 s17, 0, s15
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s16, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s16, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s16, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s9
+; GFX8-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
@@ -1438,19 +1438,19 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX8-NEXT:    v_mul_hi_u32 v5, s12, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s13
+; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s8, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v5, s13, v1
+; GFX8-NEXT:    v_mul_lo_u32 v5, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, s12, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX8-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
@@ -1460,33 +1460,33 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT:    v_mul_hi_u32 v7, s8, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, s8, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, s15, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s14, v1
+; GFX8-NEXT:    v_mul_hi_u32 v7, s14, v0
+; GFX8-NEXT:    v_mul_lo_u32 v5, s14, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s12, v5
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s8, v5
 ; GFX8-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v2, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s13, v2
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
+; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s9, v2
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s8, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s14, v3
 ; GFX8-NEXT:    v_subbrev_u32_e64 v8, s[0:1], 0, v2, vcc
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
 ; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s14, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
@@ -1501,55 +1501,55 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[6:7]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT:    s_ashr_i32 s6, s15, 31
 ; GFX8-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX8-NEXT:    s_ashr_i32 s12, s3, 31
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    s_add_u32 s0, s14, s6
+; GFX8-NEXT:    s_add_u32 s0, s10, s8
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s15, s6
-; GFX8-NEXT:    s_add_u32 s10, s10, s8
-; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX8-NEXT:    s_and_b32 s3, s3, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX8-NEXT:    s_mov_b32 s9, s8
-; GFX8-NEXT:    s_addc_u32 s11, s11, s8
-; GFX8-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
+; GFX8-NEXT:    s_addc_u32 s1, s11, s8
+; GFX8-NEXT:    s_add_u32 s2, s2, s12
+; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX8-NEXT:    s_and_b32 s7, s7, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX8-NEXT:    s_mov_b32 s13, s12
+; GFX8-NEXT:    s_addc_u32 s3, s3, s12
+; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s11
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s10
-; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
-; GFX8-NEXT:    v_xor_b32_e32 v2, s2, v2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s3
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v4
-; GFX8-NEXT:    v_mov_b32_e32 v6, s2
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s2, v3
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v7
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; GFX8-NEXT:    s_mov_b32 s7, s6
+; GFX8-NEXT:    s_mov_b32 s9, s8
 ; GFX8-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v3
-; GFX8-NEXT:    s_xor_b64 s[2:3], s[0:1], s[6:7]
+; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
 ; GFX8-NEXT:    v_add_f32_e32 v2, v6, v2
-; GFX8-NEXT:    s_sub_u32 s12, 0, s10
+; GFX8-NEXT:    s_sub_u32 s10, 0, s2
 ; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s13, 0, s11
-; GFX8-NEXT:    v_mul_lo_u32 v6, s13, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s12, v3
-; GFX8-NEXT:    v_mul_hi_u32 v9, s12, v2
-; GFX8-NEXT:    v_mul_lo_u32 v8, s12, v2
-; GFX8-NEXT:    v_mov_b32_e32 v10, s11
+; GFX8-NEXT:    s_subb_u32 s11, 0, s3
+; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s10, v2
+; GFX8-NEXT:    v_mul_lo_u32 v8, s10, v2
+; GFX8-NEXT:    v_mov_b32_e32 v10, s3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v3, v8
@@ -1575,10 +1575,10 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_addc_u32_e64 v7, s[0:1], v3, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v8, s13, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, s12, v7
-; GFX8-NEXT:    v_mul_hi_u32 v12, s12, v2
-; GFX8-NEXT:    v_mul_lo_u32 v11, s12, v2
+; GFX8-NEXT:    v_mul_lo_u32 v8, s11, v2
+; GFX8-NEXT:    v_mul_lo_u32 v9, s10, v7
+; GFX8-NEXT:    v_mul_hi_u32 v12, s10, v2
+; GFX8-NEXT:    v_mul_lo_u32 v11, s10, v2
 ; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v6
 ; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v9
 ; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v12
@@ -1586,7 +1586,6 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v12, v2, v8
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v2, v11
 ; GFX8-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
 ; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
@@ -1607,19 +1606,19 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, s3, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v3
-; GFX8-NEXT:    v_mul_hi_u32 v9, s2, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, s3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, s3
+; GFX8-NEXT:    v_mul_lo_u32 v6, s7, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s6, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s6, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v9, s3, v3
+; GFX8-NEXT:    v_mul_lo_u32 v9, s7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT:    v_mul_hi_u32 v7, s2, v3
-; GFX8-NEXT:    v_mul_hi_u32 v3, s3, v3
+; GFX8-NEXT:    v_mul_hi_u32 v7, s6, v3
+; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
@@ -1629,29 +1628,29 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
-; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX8-NEXT:    v_mul_hi_u32 v11, s10, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, s10, v2
+; GFX8-NEXT:    v_mul_lo_u32 v6, s3, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v3
+; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v2
+; GFX8-NEXT:    v_mul_lo_u32 v9, s2, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s2, v9
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s6, v9
 ; GFX8-NEXT:    v_subb_u32_e64 v8, s[0:1], v8, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s3, v6
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s7, v6
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s10, v7
+; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s2, v7
 ; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
 ; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v14, s[0:1], 1, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
@@ -1659,7 +1658,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v14
 ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s10, v11
+; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s2, v11
 ; GFX8-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
@@ -1667,57 +1666,58 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v13, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
 ; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s1, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v7, s6, v7
-; GFX8-NEXT:    v_xor_b32_e32 v8, s6, v6
-; GFX8-NEXT:    v_mov_b32_e32 v9, s6
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s6, v7
+; GFX8-NEXT:    v_xor_b32_e32 v7, s8, v7
+; GFX8-NEXT:    v_xor_b32_e32 v8, s8, v6
+; GFX8-NEXT:    v_mov_b32_e32 v9, s8
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
 ; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v8, v9, vcc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s13
+; GFX8-NEXT:    v_mov_b32_e32 v9, s5
+; GFX8-NEXT:    v_mov_b32_e32 v8, s4
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s14
-; GFX8-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdivrem_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x10
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s13, 31
 ; GFX9-NEXT:    s_ashr_i32 s6, s9, 31
-; GFX9-NEXT:    s_add_u32 s0, s12, s2
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_addc_u32 s1, s13, s2
+; GFX9-NEXT:    s_ashr_i32 s12, s1, 31
 ; GFX9-NEXT:    s_add_u32 s8, s8, s6
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX9-NEXT:    s_mov_b32 s7, s6
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_and_b32 s7, s7, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    s_addc_u32 s9, s9, s6
+; GFX9-NEXT:    s_add_u32 s0, s0, s12
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_and_b32 s7, s7, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_addc_u32 s1, s1, s12
+; GFX9-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s14
+; GFX9-NEXT:    s_mov_b32 s7, s6
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[6:7]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s16, 0, s8
+; GFX9-NEXT:    s_sub_u32 s16, 0, s14
 ; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1728,7 +1728,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s17, 0, s9
+; GFX9-NEXT:    s_subb_u32 s17, 0, s15
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s17, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s16, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s16, v0
@@ -1785,19 +1785,19 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
+; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v1, s13, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1806,33 +1806,33 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
+; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s12, v6
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s8, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
-; GFX9-NEXT:    v_sub_u32_e32 v2, s13, v2
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v4
+; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s8, v3
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s14, v3
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[0:1], 0, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s14, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v9
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
@@ -1848,28 +1848,28 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[6:7]
-; GFX9-NEXT:    s_ashr_i32 s6, s15, 31
-; GFX9-NEXT:    s_add_u32 s12, s14, s6
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX9-NEXT:    s_addc_u32 s13, s15, s6
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
+; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
 ; GFX9-NEXT:    s_add_u32 s10, s10, s8
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_and_b32 s3, s3, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_and_b32 s7, s7, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    s_addc_u32 s11, s11, s8
+; GFX9-NEXT:    s_add_u32 s2, s2, s12
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_and_b32 s7, s7, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_addc_u32 s3, s3, s12
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s11
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
-; GFX9-NEXT:    s_mov_b32 s7, s6
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[12:13], s[6:7]
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
 ; GFX9-NEXT:    v_add_f32_e32 v4, v4, v5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GFX9-NEXT:    s_sub_u32 s3, 0, s10
+; GFX9-NEXT:    s_sub_u32 s7, 0, s2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -1882,11 +1882,11 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX9-NEXT:    s_subb_u32 s14, 0, s11
+; GFX9-NEXT:    s_subb_u32 s14, 0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s14, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v10, s3, v4
-; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v6
+; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
@@ -1895,7 +1895,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
+; GFX9-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
@@ -1915,17 +1915,17 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add3_u32 v5, v9, v8, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], v6, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s14, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v7
-; GFX9-NEXT:    v_mul_hi_u32 v10, s3, v4
-; GFX9-NEXT:    v_mul_lo_u32 v11, s3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v7
+; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v11, s7, v4
 ; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GFX9-NEXT:    v_add3_u32 v8, v8, v9, v10
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v11
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v11
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GFX9-NEXT:    v_mov_b32_e32 v12, s2
+; GFX9-NEXT:    v_mov_b32_e32 v12, s6
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v9, v6
@@ -1945,20 +1945,20 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s2, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s13, v6
-; GFX9-NEXT:    v_mul_lo_u32 v9, s12, v7
+; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s6, v3
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v6
+; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v12, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, s12, v6
-; GFX9-NEXT:    v_mul_hi_u32 v6, s13, v6
+; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v7
+; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v7
 ; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, s12, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, s13, v7
+; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
@@ -1967,30 +1967,30 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v3, v6, v3, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v2
-; GFX9-NEXT:    v_mul_lo_u32 v10, s10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v11, s13
-; GFX9-NEXT:    v_mov_b32_e32 v9, s11
+; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v2
+; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
+; GFX9-NEXT:    v_mul_hi_u32 v8, s2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v10, s2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v11, s11
+; GFX9-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
-; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s12, v10
+; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s10, v10
 ; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v11, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
-; GFX9-NEXT:    v_sub_u32_e32 v6, s13, v6
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
+; GFX9-NEXT:    v_sub_u32_e32 v6, s11, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s10, v7
+; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s2, v7
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v12
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v11
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v12
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
@@ -2000,87 +2000,87 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s10, v11
+; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s2, v11
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v15, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
-; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v7, s6, v7
-; GFX9-NEXT:    v_xor_b32_e32 v8, s6, v6
+; GFX9-NEXT:    v_xor_b32_e32 v7, s8, v7
+; GFX9-NEXT:    v_xor_b32_e32 v8, s8, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0
-; GFX9-NEXT:    v_mov_b32_e32 v9, s6
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s6, v7
+; GFX9-NEXT:    v_mov_b32_e32 v9, s8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s8, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[12:13]
-; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[14:15]
+; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sdivrem_v2i64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x10
-; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_ashr_i32 s2, s17, 31
-; GFX10-NEXT:    s_ashr_i32 s0, s13, 31
-; GFX10-NEXT:    s_add_u32 s8, s16, s2
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_addc_u32 s9, s17, s2
-; GFX10-NEXT:    s_add_u32 s6, s12, s0
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s1, 1
-; GFX10-NEXT:    s_mov_b32 s1, s0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_mov_b32 s3, s2
-; GFX10-NEXT:    s_addc_u32 s7, s13, s0
-; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
-; GFX10-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX10-NEXT:    s_sub_u32 s20, 0, s6
-; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT:    s_and_b32 s10, s10, 1
+; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
+; GFX10-NEXT:    s_add_u32 s14, s8, s12
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_mov_b32 s13, s12
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    s_addc_u32 s15, s9, s12
+; GFX10-NEXT:    s_add_u32 s0, s0, s6
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-NEXT:    s_and_b32 s8, s7, 1
+; GFX10-NEXT:    s_mov_b32 s7, s6
+; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX10-NEXT:    s_addc_u32 s1, s1, s6
+; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
+; GFX10-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX10-NEXT:    s_sub_u32 s22, 0, s8
+; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX10-NEXT:    s_and_b32 s0, s0, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
-; GFX10-NEXT:    s_subb_u32 s21, 0, s7
-; GFX10-NEXT:    s_ashr_i32 s10, s19, 31
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_subb_u32 s23, 0, s9
+; GFX10-NEXT:    s_ashr_i32 s16, s11, 31
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_ashr_i32 s12, s15, 31
-; GFX10-NEXT:    s_xor_b64 s[16:17], s[2:3], s[0:1]
-; GFX10-NEXT:    s_add_u32 s0, s18, s10
+; GFX10-NEXT:    s_ashr_i32 s18, s3, 31
+; GFX10-NEXT:    s_xor_b64 s[20:21], s[12:13], s[6:7]
+; GFX10-NEXT:    s_add_u32 s0, s10, s16
 ; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    s_mov_b32 s13, s12
+; GFX10-NEXT:    s_mov_b32 s19, s18
 ; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX10-NEXT:    s_mov_b32 s11, s10
-; GFX10-NEXT:    s_addc_u32 s1, s19, s10
-; GFX10-NEXT:    s_add_u32 s14, s14, s12
-; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX10-NEXT:    s_and_b32 s3, s3, 1
+; GFX10-NEXT:    s_mov_b32 s17, s16
+; GFX10-NEXT:    s_addc_u32 s1, s11, s16
+; GFX10-NEXT:    s_add_u32 s2, s2, s18
+; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX10-NEXT:    s_and_b32 s6, s6, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX10-NEXT:    s_addc_u32 s15, s15, s12
-; GFX10-NEXT:    s_xor_b64 s[18:19], s[0:1], s[10:11]
-; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
+; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_addc_u32 s3, s3, s18
+; GFX10-NEXT:    s_xor_b64 s[10:11], s[0:1], s[16:17]
+; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[18:19]
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s14
-; GFX10-NEXT:    s_sub_u32 s3, 0, s14
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s2
+; GFX10-NEXT:    s_sub_u32 s6, 0, s2
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
@@ -2089,15 +2089,15 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    s_subb_u32 s22, 0, s15
+; GFX10-NEXT:    s_subb_u32 s7, 0, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v3, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s20, v2
+; GFX10-NEXT:    v_mul_lo_u32 v3, s22, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s21, v0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s20, v0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s20, v0
+; GFX10-NEXT:    v_mul_lo_u32 v4, s23, v0
+; GFX10-NEXT:    v_mul_hi_u32 v5, s22, v0
+; GFX10-NEXT:    v_mul_lo_u32 v6, s22, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v4, v3, v5
 ; GFX10-NEXT:    v_trunc_f32_e32 v4, v7
@@ -2112,17 +2112,17 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_add_f32_e32 v1, v9, v1
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v8
-; GFX10-NEXT:    v_mul_lo_u32 v9, s3, v4
+; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v10, v6
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v12, s22, v1
-; GFX10-NEXT:    v_mul_hi_u32 v13, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v12, s7, v1
+; GFX10-NEXT:    v_mul_hi_u32 v13, s6, v1
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v11
-; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v11, s6, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v5
@@ -2140,15 +2140,15 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v12, s0, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v4, v8
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-NEXT:    v_mul_lo_u32 v14, s21, v0
+; GFX10-NEXT:    v_mul_lo_u32 v14, s23, v0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v9, v6
-; GFX10-NEXT:    v_mul_hi_u32 v15, s20, v0
+; GFX10-NEXT:    v_mul_hi_u32 v15, s22, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v11
-; GFX10-NEXT:    v_mul_lo_u32 v16, s20, v12
+; GFX10-NEXT:    v_mul_lo_u32 v16, s22, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v10
-; GFX10-NEXT:    v_mul_lo_u32 v13, s20, v0
+; GFX10-NEXT:    v_mul_lo_u32 v13, s22, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, v7, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
@@ -2167,23 +2167,23 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add3_u32 v6, v7, v6, v8
 ; GFX10-NEXT:    v_add_co_u32 v5, s1, v10, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
-; GFX10-NEXT:    v_mul_lo_u32 v3, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v15, v13
-; GFX10-NEXT:    v_mul_lo_u32 v13, s22, v1
+; GFX10-NEXT:    v_mul_lo_u32 v13, s7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s1, v4, v6, s0
 ; GFX10-NEXT:    v_add_co_u32 v5, s1, v5, v9
-; GFX10-NEXT:    v_mul_hi_u32 v15, s3, v1
+; GFX10-NEXT:    v_mul_hi_u32 v15, s6, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v16
-; GFX10-NEXT:    v_mul_lo_u32 v9, s3, v11
+; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v6
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v12, v14
 ; GFX10-NEXT:    v_mul_lo_u32 v12, v11, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, v10, v16
-; GFX10-NEXT:    s_load_dwordx4 s[20:23], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_add_co_u32 v5, s1, v8, v5
 ; GFX10-NEXT:    v_add3_u32 v9, v13, v9, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
@@ -2200,22 +2200,22 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v3, s1, v8, v3
-; GFX10-NEXT:    v_mul_lo_u32 v8, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s15, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v13
-; GFX10-NEXT:    v_mul_lo_u32 v14, s8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v12, s8, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s9, v0
-; GFX10-NEXT:    v_mul_lo_u32 v13, s9, v2
+; GFX10-NEXT:    v_mul_lo_u32 v14, s14, v2
+; GFX10-NEXT:    v_mul_hi_u32 v12, s14, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s15, v0
+; GFX10-NEXT:    v_mul_lo_u32 v13, s15, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v10
-; GFX10-NEXT:    v_mul_hi_u32 v15, s8, v2
+; GFX10-NEXT:    v_mul_hi_u32 v15, s14, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v0, s1, v13, v0
-; GFX10-NEXT:    v_mul_hi_u32 v2, s9, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v12
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v10
@@ -2228,41 +2228,41 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
 ; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
-; GFX10-NEXT:    v_mul_lo_u32 v6, s7, v0
+; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v10, v8, v2
 ; GFX10-NEXT:    v_add3_u32 v5, v5, v7, v9
-; GFX10-NEXT:    v_mul_hi_u32 v7, s6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s6, v2
+; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, vcc_lo, v4, v5, s0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s6, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s8, v0
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v4, v6, v8, v7
-; GFX10-NEXT:    v_mul_lo_u32 v6, s19, v1
-; GFX10-NEXT:    v_mul_hi_u32 v7, s19, v1
-; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s8, v5
-; GFX10-NEXT:    v_mul_lo_u32 v14, s18, v3
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s9, v4
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s9, v4, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v15, s19, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v5
-; GFX10-NEXT:    v_mul_hi_u32 v1, s18, v1
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s7, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v4
-; GFX10-NEXT:    v_mul_hi_u32 v17, s18, v3
+; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
+; GFX10-NEXT:    v_mul_hi_u32 v7, s11, v1
+; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s14, v5
+; GFX10-NEXT:    v_mul_lo_u32 v14, s10, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s15, v4
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s15, v4, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v15, s11, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v5
+; GFX10-NEXT:    v_mul_hi_u32 v1, s10, v1
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v4
+; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s19, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v5, s6
+; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v5, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v4
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s7, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v4
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v13
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v13
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v12
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
@@ -2275,7 +2275,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_co_u32 v17, s0, v0, 1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v14, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v13
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v13
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v1
@@ -2284,52 +2284,52 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v18, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v11
 ; GFX10-NEXT:    v_add3_u32 v3, v6, v1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v10, s15, v7
-; GFX10-NEXT:    v_mul_lo_u32 v16, s14, v7
+; GFX10-NEXT:    v_mul_lo_u32 v10, s3, v7
+; GFX10-NEXT:    v_mul_lo_u32 v16, s2, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v15, s0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s14, v3
-; GFX10-NEXT:    v_mul_hi_u32 v15, s14, v7
+; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v3
+; GFX10-NEXT:    v_mul_hi_u32 v15, s2, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v14, s0
-; GFX10-NEXT:    v_sub_co_u32 v14, s1, v12, s6
+; GFX10-NEXT:    v_sub_co_u32 v14, s1, v12, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v6, v10, v11, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v13, v8, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s0
-; GFX10-NEXT:    v_sub_co_u32 v8, s0, s18, v16
-; GFX10-NEXT:    v_xor_b32_e32 v0, s16, v0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s1, s19, v6, s0
+; GFX10-NEXT:    v_sub_co_u32 v8, s0, s10, v16
+; GFX10-NEXT:    v_xor_b32_e32 v0, s20, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s1, s11, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s19, v6
-; GFX10-NEXT:    v_xor_b32_e32 v1, s17, v1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v10
-; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v2
-; GFX10-NEXT:    v_xor_b32_e32 v5, s2, v5
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s11, v6
+; GFX10-NEXT:    v_xor_b32_e32 v1, s21, v1
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v10
+; GFX10-NEXT:    v_xor_b32_e32 v2, s12, v2
+; GFX10-NEXT:    v_xor_b32_e32 v5, s12, v5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, vcc_lo, s15, v4, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, vcc_lo, s3, v4, s0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s14
+; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v4, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s16
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v4, vcc_lo, s15, v4, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s17, v1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s15, v10
+; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s20
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s21, v1, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s15, v13
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v12
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v15, s0, v7, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v3, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s15, v13
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v14, s0
 ; GFX10-NEXT:    v_add_co_u32 v14, s0, v15, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_sub_co_u32 v11, s0, v12, s14
+; GFX10-NEXT:    v_sub_co_u32 v11, s0, v12, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v15, v14, vcc_lo
@@ -2340,20 +2340,20 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s0
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v5, s2
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v2, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v5, s12
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[16:17], s[18:19]
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v2, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v7
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v3
-; GFX10-NEXT:    v_xor_b32_e32 v6, s10, v6
-; GFX10-NEXT:    v_xor_b32_e32 v7, s10, v8
+; GFX10-NEXT:    v_xor_b32_e32 v6, s16, v6
+; GFX10-NEXT:    v_xor_b32_e32 v7, s16, v8
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, s0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s10, v7, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s16
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s16, v7, vcc_lo
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_store_dwordx4 v9, v[0:3], s[20:21]
-; GFX10-NEXT:    global_store_dwordx4 v9, v[4:7], s[22:23]
+; GFX10-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = sdiv <2 x i64> %x, %y
   store <2 x i64> %div, <2 x i64> addrspace(1)* %out0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index e74d503c92acd..03790c0a68498 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -8,45 +8,47 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_srem_i64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v5, v1, v3
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT:    v_mov_b32_e32 v5, v1
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB0_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v4
-; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v4
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v2
-; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v0, v2, v0
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v6, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v4
+; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
-; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v4
-; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v4
+; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
-; CHECK-NEXT:    v_mul_lo_u32 v12, v4, v9
-; CHECK-NEXT:    v_mul_hi_u32 v13, v4, v11
+; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
+; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
 ; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
@@ -54,7 +56,7 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
@@ -65,18 +67,18 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
 ; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v4
+; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v4
+; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v2
+; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v12
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -84,7 +86,7 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CHECK-NEXT:    v_mul_hi_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
@@ -96,95 +98,93 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
 ; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, v2, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
+; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v4, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v4, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v0
+; CHECK-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v0, v2
-; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v1
+; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v3
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v2, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v1, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v2, v6, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB0_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v2
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v4
-; CHECK-NEXT:    v_mov_b32_e32 v1, v5
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, %den
   ret i64 %result
@@ -680,11 +680,13 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_srem_v2i64:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v9, v1
-; CGP-NEXT:    v_mov_b32_e32 v8, v0
-; CGP-NEXT:    v_or_b32_e32 v1, v9, v5
+; CGP-NEXT:    v_mov_b32_e32 v11, v1
+; CGP-NEXT:    v_mov_b32_e32 v10, v0
+; CGP-NEXT:    v_or_b32_e32 v1, v11, v5
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CGP-NEXT:    v_mov_b32_e32 v8, v2
+; CGP-NEXT:    v_mov_b32_e32 v9, v3
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
@@ -692,44 +694,44 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v0
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v5, v0, vcc
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v5, v0, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT:    v_xor_b32_e32 v0, v4, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v10
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v9, v10, vcc
+; CGP-NEXT:    v_xor_b32_e32 v0, v2, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v4
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v9, v9
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v9
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v2
+; CGP-NEXT:    v_trunc_f32_e32 v10, v10
+; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v10
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v13, v12, v4
-; CGP-NEXT:    v_mul_lo_u32 v14, v11, v9
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v4
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v4
-; CGP-NEXT:    v_xor_b32_e32 v8, v8, v10
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
+; CGP-NEXT:    v_mul_lo_u32 v14, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v16, v11, v2
+; CGP-NEXT:    v_mul_lo_u32 v15, v11, v2
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v15
-; CGP-NEXT:    v_mul_lo_u32 v16, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v4, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v9, v15
+; CGP-NEXT:    v_mul_lo_u32 v14, v10, v15
+; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v17, v9, v13
+; CGP-NEXT:    v_mul_lo_u32 v17, v10, v13
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v2, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
@@ -739,18 +741,18 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v9, v13, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v12, v4
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v12, v2
 ; CGP-NEXT:    v_mul_lo_u32 v15, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v16, v11, v4
-; CGP-NEXT:    v_mul_hi_u32 v11, v11, v4
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
+; CGP-NEXT:    v_mul_lo_u32 v16, v11, v2
+; CGP-NEXT:    v_mul_hi_u32 v11, v11, v2
+; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v16
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT:    v_mul_lo_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_lo_u32 v15, v2, v11
 ; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
@@ -758,7 +760,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
@@ -769,69 +771,69 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v8, v4
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v3, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v12, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_mul_hi_u32 v12, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v5, v10
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_mul_lo_u32 v11, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, v1, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT:    v_subb_u32_e64 v9, s[4:5], v8, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v8, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v0, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v1, v10
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v5, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v5, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v0
-; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v0, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v5, v1
-; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v0
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v3, v1
+; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
-; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v4, v0, vcc
+; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v10
-; CGP-NEXT:    v_xor_b32_e32 v4, v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v10
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v4, v10, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_xor_b32_e32 v2, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr4
-; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -845,9 +847,9 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_mul_lo_u32 v0, v0, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v8, v0
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v10, v0
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v4
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -857,54 +859,54 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB2_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v7
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v9, v7
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB2_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT:    v_xor_b32_e32 v4, v6, v4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
-; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_mul_lo_u32 v11, v10, v6
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v7
-; CGP-NEXT:    v_mul_hi_u32 v14, v9, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v6
+; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v2
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v7, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v2, v4, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v9, v6, vcc
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
+; CGP-NEXT:    v_trunc_f32_e32 v8, v8
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v8
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v6
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v6, v11
-; CGP-NEXT:    v_mul_hi_u32 v15, v6, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v15, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT:    v_mul_hi_u32 v14, v6, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
@@ -914,18 +916,18 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v7, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v10, v6
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v10, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v6
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v6
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v4
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_hi_u32 v11, v6, v14
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
 ; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
@@ -933,7 +935,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -944,96 +946,94 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v11, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_mul_hi_u32 v10, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, v4, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_mul_lo_u32 v9, v2, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v7, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v4
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v5
-; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v3
+; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v4
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v5
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v2
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT:    v_xor_b32_e32 v4, v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v3, v6
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v4, v6, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB2_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v6
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, v3, v6
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v6
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v6
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -2474,46 +2474,48 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_lshl_b64 v[4:5], s[4:5], v2
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_or_b32_e32 v3, v1, v5
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB7_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v2
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v2, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v2
-; CHECK-NEXT:    v_xor_b32_e32 v2, v4, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
-; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v2, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v0
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v6, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v0, v2, v0
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v0
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v5
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
+; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v4
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
+; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
-; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v4
-; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v4
+; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
-; CHECK-NEXT:    v_mul_lo_u32 v12, v4, v9
-; CHECK-NEXT:    v_mul_hi_u32 v13, v4, v11
+; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
+; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
 ; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
@@ -2521,7 +2523,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
@@ -2532,18 +2534,18 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
 ; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v9, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v4
+; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v4
+; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v2
+; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
 ; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v12
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
@@ -2551,7 +2553,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CHECK-NEXT:    v_mul_hi_u32 v11, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
@@ -2563,95 +2565,93 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
 ; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
+; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v4, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v4, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v2
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v0
+; CHECK-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v0, v3
-; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v1
+; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v3
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT:    v_subb_u32_e32 v3, vcc, v1, v6, vcc
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    v_xor_b32_e32 v2, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v1, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v2, v6, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB7_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v4
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v2
-; CHECK-NEXT:    v_mov_b32_e32 v1, v3
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = srem i64 %x, %shl.y
@@ -2951,58 +2951,60 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v4
-; CGP-NEXT:    v_mov_b32_e32 v7, v1
-; CGP-NEXT:    v_mov_b32_e32 v5, v0
-; CGP-NEXT:    v_or_b32_e32 v1, v7, v11
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_mov_b32_e32 v9, v1
+; CGP-NEXT:    v_mov_b32_e32 v8, v0
+; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT:    v_lshl_b64 v[8:9], s[4:5], v6
+; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_2
 ; CGP-NEXT:  ; %bb.1:
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v0
-; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v0, vcc
+; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
-; CGP-NEXT:    v_xor_b32_e32 v0, v4, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v1
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v10
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v10, vcc
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
+; CGP-NEXT:    v_xor_b32_e32 v0, v2, v0
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v4, vcc
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v2
+; CGP-NEXT:    v_trunc_f32_e32 v8, v8
+; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v10
-; CGP-NEXT:    v_mul_lo_u32 v13, v12, v4
-; CGP-NEXT:    v_mul_lo_u32 v14, v11, v7
-; CGP-NEXT:    v_mul_hi_u32 v16, v11, v4
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v4
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v8
+; CGP-NEXT:    v_mul_hi_u32 v16, v9, v2
+; CGP-NEXT:    v_mul_lo_u32 v15, v9, v2
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v15
-; CGP-NEXT:    v_mul_lo_u32 v16, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v4, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v7, v15
+; CGP-NEXT:    v_mul_lo_u32 v14, v8, v15
+; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v8, v15
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v17, v7, v13
+; CGP-NEXT:    v_mul_lo_u32 v17, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v2, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
@@ -3012,27 +3014,27 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v7, v13, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, v12, v4
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v16, v11, v4
-; CGP-NEXT:    v_mul_hi_u32 v11, v11, v4
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v13
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v8, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v12, v2
+; CGP-NEXT:    v_mul_lo_u32 v15, v9, v14
+; CGP-NEXT:    v_mul_lo_u32 v16, v9, v2
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v2
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v16
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
 ; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT:    v_mul_lo_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_lo_u32 v15, v2, v9
 ; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
+; CGP-NEXT:    v_mul_lo_u32 v13, v14, v9
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v14, v9
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
@@ -3041,134 +3043,134 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v3, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v6, v2
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v12, v5, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
+; CGP-NEXT:    v_mul_hi_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; CGP-NEXT:    v_mul_lo_u32 v11, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, v1, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, v1, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v6, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v6, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v0
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_mul_lo_u32 v9, v0, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v1, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v6, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v6, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v0
-; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v0, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v5, v1
-; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v3, v1
+; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
-; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v4, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v1
+; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v9, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v10
-; CGP-NEXT:    v_xor_b32_e32 v4, v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v10
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v4, v10, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_xor_b32_e32 v2, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_4
 ; CGP-NEXT:  ; %bb.3:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v5, v0
-; CGP-NEXT:    v_mul_lo_u32 v0, v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v5, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v10
+; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_lo_u32 v0, v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v8, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v10
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB8_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v9
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v9
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v4
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
-; CGP-NEXT:    v_xor_b32_e32 v4, v6, v4
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
-; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
+; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v11
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v2
+; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v2, v4, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v8
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
+; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
 ; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_mul_lo_u32 v11, v10, v6
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v8
+; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v12, v9, v7
-; CGP-NEXT:    v_mul_hi_u32 v14, v9, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v6
+; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v6, v11
-; CGP-NEXT:    v_mul_hi_u32 v15, v6, v13
+; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
@@ -3176,7 +3178,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT:    v_mul_hi_u32 v14, v6, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
@@ -3187,18 +3189,18 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v7, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v6
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v6
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v9, v9, v4
 ; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_hi_u32 v11, v6, v14
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
 ; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
 ; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
@@ -3206,7 +3208,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
@@ -3218,95 +3220,93 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
 ; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v11, v2, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
+; CGP-NEXT:    v_mul_lo_u32 v9, v6, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_mul_hi_u32 v10, v2, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT:    v_mul_hi_u32 v10, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v9, v4, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
+; CGP-NEXT:    v_mul_lo_u32 v9, v2, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v6, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v6, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v4
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v2
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v5
-; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v3
+; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v4
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v5
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v2
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v2, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_subb_u32_e32 v5, vcc, v3, v8, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    v_xor_b32_e32 v4, v2, v8
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v3, v8
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v4, v8, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v8
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v8
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v8
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index aad2ca5fc2811..a7efed6808256 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -8,39 +8,41 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_udiv_i64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v5, v1, v3
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v1
+; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB0_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
 ; CHECK-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v4
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v4
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v0
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v8
-; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v8
-; CHECK-NEXT:    v_mul_hi_u32 v14, v4, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_lo_u32 v13, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v14, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
@@ -55,21 +57,21 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v5, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v8
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v6
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v6
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v6
-; CHECK-NEXT:    v_mul_hi_u32 v13, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v9, v6
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
@@ -85,95 +87,93 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, v5, v0
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v1
+; CHECK-NEXT:    v_mul_hi_u32 v10, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, v5, v1
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v4
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v6, v2, v5
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
-; CHECK-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v0
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v6, v2, v1
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 1, v0
+; CHECK-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
 ; CHECK-NEXT:    v_addc_u32_e32 v12, vcc, 0, v11, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v7, s[4:5], v1, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v6
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT:    v_subb_u32_e64 v7, s[4:5], v5, v6, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v5, s[4:5], v5, v6
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_subb_u32_e32 v5, vcc, v5, v3, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; CHECK-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v10, v8, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v11, v12, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_cbranch_execz BB0_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v2
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v0, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v4
-; CHECK-NEXT:    v_mov_b32_e32 v1, v5
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %num, %den
   ret i64 %result
@@ -628,9 +628,11 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_udiv_v2i64:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v8, v0
-; CGP-NEXT:    v_mov_b32_e32 v9, v1
-; CGP-NEXT:    v_or_b32_e32 v1, v9, v5
+; CGP-NEXT:    v_mov_b32_e32 v10, v0
+; CGP-NEXT:    v_mov_b32_e32 v11, v1
+; CGP-NEXT:    v_mov_b32_e32 v8, v2
+; CGP-NEXT:    v_mov_b32_e32 v9, v3
+; CGP-NEXT:    v_or_b32_e32 v1, v11, v5
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -640,8 +642,8 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v4
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v5
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
-; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, 0, v4
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -650,10 +652,10 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v14, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v14, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_mul_lo_u32 v14, v1, v13
 ; CGP-NEXT:    v_mul_hi_u32 v16, v0, v13
@@ -680,93 +682,93 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
 ; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v1, v12, vcc
 ; CGP-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v10, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v13
 ; CGP-NEXT:    v_mul_lo_u32 v15, v13, v12
 ; CGP-NEXT:    v_mul_hi_u32 v16, v0, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
-; CGP-NEXT:    v_mul_lo_u32 v11, v0, v10
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v10
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v15, v11
+; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v3, v2
+; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v14
+; CGP-NEXT:    v_mul_lo_u32 v3, v0, v2
+; CGP-NEXT:    v_mul_lo_u32 v14, v13, v2
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v13, v2
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v15, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v13, v3
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v15
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v12, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v11, v8, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v1
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT:    v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v11, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v11, v1
+; CGP-NEXT:    v_mul_hi_u32 v14, v10, v1
+; CGP-NEXT:    v_mul_hi_u32 v1, v11, v1
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v13, v3
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v4, v0
 ; CGP-NEXT:    v_mul_lo_u32 v12, v5, v0
 ; CGP-NEXT:    v_mul_hi_u32 v13, v4, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v4, v1
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT:    v_mul_lo_u32 v2, v4, v1
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, 1, v0
 ; CGP-NEXT:    v_addc_u32_e32 v15, vcc, 0, v1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v14
 ; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT:    v_subb_u32_e64 v11, s[4:5], v9, v10, vcc
-; CGP-NEXT:    v_sub_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v11, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v11, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v9, vcc, v9, v5, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v10, v13, v10, vcc
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v4
-; CGP-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v4
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v5, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v10, v13, v11, vcc
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v12, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v15, v16, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v14, v12, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v15, v16, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr4
-; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
@@ -780,53 +782,53 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v10, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB2_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v7
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v9, v7
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB2_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v7
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
-; CGP-NEXT:    v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v5, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v8, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v7
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6
+; CGP-NEXT:    v_subb_u32_e32 v5, vcc, 0, v7, vcc
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CGP-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT:    v_trunc_f32_e32 v3, v3
+; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v2
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v3, v11
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_lo_u32 v15, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v10
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v15, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v16, v2, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
@@ -841,125 +843,123 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v5, v10, vcc
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, v8, v11
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v3, v10, vcc
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v5, v2
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v11
 ; CGP-NEXT:    v_mul_lo_u32 v13, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v11, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
-; CGP-NEXT:    v_mul_lo_u32 v9, v4, v8
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v8
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v11, v8
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v13, v9
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v5, v4
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v5, v2, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v11, v4
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v11, v4
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v9, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_mul_lo_u32 v4, v9, v2
+; CGP-NEXT:    v_mul_hi_u32 v5, v8, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v9, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v3
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v9, v3
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v9, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v11, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT:    v_mul_lo_u32 v8, v6, v5
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v4
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v5, v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v7, v2
+; CGP-NEXT:    v_mul_hi_u32 v11, v6, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v6, v3
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v2
+; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v12
 ; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v13, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT:    v_subb_u32_e64 v9, s[4:5], v3, v8, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v8
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v9, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v9, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v7, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v8, v11, v9, vcc
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v12, v10, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB2_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v6
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v3, v6
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v6
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v6
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v8, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -2291,41 +2291,43 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_udiv_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_lshl_b64 v[4:5], s[4:5], v2
-; CHECK-NEXT:    v_or_b32_e32 v3, v1, v5
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
+; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB7_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v4
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, v5
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
-; CHECK-NEXT:    v_subb_u32_e32 v7, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v3, v3
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v2
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v6
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v5
+; CHECK-NEXT:    v_subb_u32_e32 v7, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v0
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v8
-; CHECK-NEXT:    v_mul_lo_u32 v13, v3, v8
-; CHECK-NEXT:    v_mul_hi_u32 v14, v2, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v8
+; CHECK-NEXT:    v_mul_lo_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_lo_u32 v13, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v14, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
@@ -2340,22 +2342,22 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v3, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v2
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v9
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v8
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v6
-; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v6
-; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v9, v6
+; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v7, v2
+; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v10
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v2
+; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v9, v2
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
@@ -2369,96 +2371,94 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v8
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v3
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 1, v2
-; CHECK-NEXT:    v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, v5, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v0
+; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v0
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v2, v5, v1
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 1, v0
+; CHECK-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 1, v10
 ; CHECK-NEXT:    v_addc_u32_e32 v12, vcc, 0, v11, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v7, s[4:5], v1, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v6
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
+; CHECK-NEXT:    v_subb_u32_e64 v7, s[4:5], v4, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v4, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v10, v8, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v11, v12, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_cbranch_execz BB7_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v0, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v0, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v2
-; CHECK-NEXT:    v_mov_b32_e32 v1, v3
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = udiv i64 %x, %shl.y
@@ -2731,12 +2731,14 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v5, v0
-; CGP-NEXT:    v_mov_b32_e32 v7, v1
+; CGP-NEXT:    v_mov_b32_e32 v8, v0
+; CGP-NEXT:    v_mov_b32_e32 v9, v1
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
 ; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v4
-; CGP-NEXT:    v_lshl_b64 v[8:9], s[4:5], v6
-; CGP-NEXT:    v_or_b32_e32 v1, v7, v11
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
+; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2744,10 +2746,10 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_2
 ; CGP-NEXT:  ; %bb.1:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v10
-; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v11
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v10
-; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v3
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
+; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2816,13 +2818,13 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v4, v7, v0
-; CGP-NEXT:    v_mul_hi_u32 v6, v5, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v7, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v7, v1
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v7, v1
+; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
+; CGP-NEXT:    v_mul_hi_u32 v6, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v1
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v1
+; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
@@ -2836,236 +2838,234 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v0
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v4, v10, v1
+; CGP-NEXT:    v_mul_lo_u32 v4, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, 1, v0
 ; CGP-NEXT:    v_addc_u32_e32 v15, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v14
 ; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v7, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v11
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v9, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v9, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v11, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v13, v7, vcc
-; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v3, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v14, v12, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v15, v16, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v14, v12, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v15, v16, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB8_4
 ; CGP-NEXT:  ; %bb.3:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v5, v0
-; CGP-NEXT:    v_mul_lo_u32 v1, v0, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v1, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_lo_u32 v1, v0, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB8_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v9
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v8
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v9
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, 0, v8
-; CGP-NEXT:    v_subb_u32_e32 v7, vcc, 0, v9, vcc
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v5, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v13, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v11
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_lo_u32 v15, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v11
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v10
+; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CGP-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT:    v_trunc_f32_e32 v3, v3
+; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v9
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v3, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v8
+; CGP-NEXT:    v_mul_lo_u32 v15, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v16, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v5, v10, vcc
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v11
-; CGP-NEXT:    v_mul_lo_u32 v13, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v10
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
-; CGP-NEXT:    v_mul_lo_u32 v7, v4, v6
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v6
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v11, v6
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v13, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v3, v8, vcc
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v8, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v8
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v9, v8
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v6, v4
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v9, v4
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v12, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v5, v6, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v7, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v6
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v13
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v8, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_mul_lo_u32 v4, v7, v2
+; CGP-NEXT:    v_mul_hi_u32 v6, v5, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v3
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v11, v8, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_mul_lo_u32 v6, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v4
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v5, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_mul_lo_u32 v6, v10, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v11, v2
+; CGP-NEXT:    v_mul_hi_u32 v9, v10, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v10, v3
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, 1, v2
+; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v12
 ; CGP-NEXT:    v_addc_u32_e32 v14, vcc, 0, v13, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
-; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v7, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v11, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v9, v7, vcc
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v12, v10, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v12, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[6:7], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[6:7]
 ; CGP-NEXT:    s_cbranch_execz BB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v8
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v8
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = udiv <2 x i64> %x, %shl.y

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index fd45cafa81483..cddd490c32f89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -8,39 +8,41 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_urem_i64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v5, v1, v3
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, v1
+; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB0_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
 ; CHECK-NEXT:    v_subb_u32_e32 v7, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
-; CHECK-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v4
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v4
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v0
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v8
-; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v8
-; CHECK-NEXT:    v_mul_hi_u32 v14, v4, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_lo_u32 v13, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v14, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
@@ -55,21 +57,21 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v5, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v4, v8
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v6
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v6
 ; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v6
-; CHECK-NEXT:    v_mul_hi_u32 v13, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v9, v6
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
@@ -85,92 +87,90 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
 ; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v1, v5
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, v5, v0
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v1
+; CHECK-NEXT:    v_mul_hi_u32 v10, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, v5, v1
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v5, v2, v5
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v4, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v2, v0
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
+; CHECK-NEXT:    v_mul_lo_u32 v1, v2, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v4, v7
+; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v5, v0, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v5, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v0, v2
-; CHECK-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v0, v3, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v1, v2
+; CHECK-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v0, v3, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
 ; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v3, v9, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v7, v0, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    ; implicit-def: $vgpr4
 ; CHECK-NEXT:  BB0_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB0_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v2
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB0_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v4
-; CHECK-NEXT:    v_mov_b32_e32 v1, v5
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, %den
   ret i64 %result
@@ -620,9 +620,11 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-LABEL: v_urem_v2i64:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v8, v0
-; CGP-NEXT:    v_mov_b32_e32 v9, v1
-; CGP-NEXT:    v_or_b32_e32 v1, v9, v5
+; CGP-NEXT:    v_mov_b32_e32 v10, v0
+; CGP-NEXT:    v_mov_b32_e32 v11, v1
+; CGP-NEXT:    v_mov_b32_e32 v8, v2
+; CGP-NEXT:    v_mov_b32_e32 v9, v3
+; CGP-NEXT:    v_or_b32_e32 v1, v11, v5
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -632,8 +634,8 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:  ; %bb.1:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v4
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v5
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, 0, v4
-; CGP-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, 0, v4
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -642,10 +644,10 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v14, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v14, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_mul_lo_u32 v14, v1, v13
 ; CGP-NEXT:    v_mul_hi_u32 v16, v0, v13
@@ -672,73 +674,73 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
 ; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v1, v12, vcc
 ; CGP-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v10, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v13
 ; CGP-NEXT:    v_mul_lo_u32 v15, v13, v12
 ; CGP-NEXT:    v_mul_hi_u32 v16, v0, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
-; CGP-NEXT:    v_mul_lo_u32 v11, v0, v10
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v10
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v15, v11
+; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v3, v2
+; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v14
+; CGP-NEXT:    v_mul_lo_u32 v3, v0, v2
+; CGP-NEXT:    v_mul_lo_u32 v14, v13, v2
+; CGP-NEXT:    v_mul_hi_u32 v17, v0, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v13, v2
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v15, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v13, v3
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v15
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v12, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v11, v8, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v1
-; CGP-NEXT:    v_mul_hi_u32 v14, v8, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT:    v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v11, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v11, v1
+; CGP-NEXT:    v_mul_hi_u32 v14, v10, v1
+; CGP-NEXT:    v_mul_hi_u32 v1, v11, v1
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v4, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v13, v3
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v4, v0
 ; CGP-NEXT:    v_mul_lo_u32 v12, v5, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v4, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; CGP-NEXT:    v_mul_lo_u32 v1, v4, v1
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v11
-; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v9, v0, vcc
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v9, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v10, v3
+; CGP-NEXT:    v_subb_u32_e64 v2, s[4:5], v11, v0, vcc
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v0
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v8, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v1, v4
 ; CGP-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v4
@@ -753,11 +755,11 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v5, v11, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v8, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr4
-; CGP-NEXT:    ; implicit-def: $vgpr8
+; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  BB2_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -771,9 +773,9 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_mul_lo_u32 v0, v0, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v8, v0
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v10, v0
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v4
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -783,39 +785,39 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB2_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v7
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v9, v7
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB2_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v7
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
-; CGP-NEXT:    v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v5, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v8, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v7
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6
+; CGP-NEXT:    v_subb_u32_e32 v5, vcc, 0, v7, vcc
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CGP-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT:    v_trunc_f32_e32 v3, v3
+; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v2
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v11
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v3, v11
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_lo_u32 v15, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v10
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v15, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v16, v2, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
@@ -830,122 +832,120 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v5, v10, vcc
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v9, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, v8, v11
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
+; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v3, v10, vcc
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v5, v5, v2
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v11
 ; CGP-NEXT:    v_mul_lo_u32 v13, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v11, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
-; CGP-NEXT:    v_mul_lo_u32 v9, v4, v8
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v8
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v11, v8
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v13, v9
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v5, v4
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v5, v2, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v11, v4
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v11, v4
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v9, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_mul_lo_u32 v4, v9, v2
+; CGP-NEXT:    v_mul_hi_u32 v5, v8, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v9, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v3
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v3
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v9, v3
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v9, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; CGP-NEXT:    v_mul_lo_u32 v5, v6, v5
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v5, v6, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v7, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v6, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v3, v6, v3
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v8, v5
+; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v9, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v9, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v2, v6
-; CGP-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v3, v6
+; CGP-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v7, v11, v10, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v9, v2, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v3, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB2_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB2_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v6
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, v3, v6
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v6
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v6
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB2_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -1651,41 +1651,43 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_urem_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CHECK-NEXT:    v_lshl_b64 v[4:5], s[4:5], v2
-; CHECK-NEXT:    v_or_b32_e32 v3, v1, v5
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CHECK-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CHECK-NEXT:    v_lshl_b64 v[5:6], s[4:5], v2
+; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB7_2
 ; CHECK-NEXT:  ; %bb.1:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v4
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, v5
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
-; CHECK-NEXT:    v_subb_u32_e32 v7, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v3, v3
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v2
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v6
+; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v5
+; CHECK-NEXT:    v_subb_u32_e32 v7, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
+; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v0
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v9
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v9
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v1, v9
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v8
-; CHECK-NEXT:    v_mul_lo_u32 v13, v3, v8
-; CHECK-NEXT:    v_mul_hi_u32 v14, v2, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v8
+; CHECK-NEXT:    v_mul_lo_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_lo_u32 v13, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v14, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
@@ -1700,22 +1702,22 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v3, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v8
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v2
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v9
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
+; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
+; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v8
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v6
-; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v6
-; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v9, v6
+; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v7, v2
+; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v10
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v2
+; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v9, v2
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
@@ -1729,93 +1731,91 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v8
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v1
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v1
+; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v4, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v3, v4, v3
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e64 v3, s[4:5], v1, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v0, v4
-; CHECK-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, v5, v0
+; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v1, v5, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v3, v7
+; CHECK-NEXT:    v_subb_u32_e64 v2, s[4:5], v4, v0, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v4, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v0, v6, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v1, v5
+; CHECK-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v5
+; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v0, v6, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v5, v9, v8, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
-; CHECK-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v6, v9, v8, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v5, v7, v0, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v5, vcc
+; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
+; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:  BB7_2: ; %Flow
 ; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz BB7_4
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, 0, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
-; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, 0, v5
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v4
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v4
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v0, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  BB7_4:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    v_mov_b32_e32 v0, v2
-; CHECK-NEXT:    v_mov_b32_e32 v1, v3
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = urem i64 %x, %shl.y
@@ -2086,12 +2086,14 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    v_mov_b32_e32 v5, v0
-; CGP-NEXT:    v_mov_b32_e32 v7, v1
+; CGP-NEXT:    v_mov_b32_e32 v8, v0
+; CGP-NEXT:    v_mov_b32_e32 v9, v1
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
 ; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v4
-; CGP-NEXT:    v_lshl_b64 v[8:9], s[4:5], v6
-; CGP-NEXT:    v_or_b32_e32 v1, v7, v11
+; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
+; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -2099,10 +2101,10 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_2
 ; CGP-NEXT:  ; %bb.1:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v10
-; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v11
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v10
-; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v3
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
+; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2171,13 +2173,13 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v4, v7, v0
-; CGP-NEXT:    v_mul_hi_u32 v6, v5, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v7, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v1
-; CGP-NEXT:    v_mul_lo_u32 v13, v7, v1
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v7, v1
+; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
+; CGP-NEXT:    v_mul_hi_u32 v6, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v1
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v1
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v1
+; CGP-NEXT:    v_mul_hi_u32 v1, v9, v1
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
@@ -2191,230 +2193,228 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v10, v0
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v2, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v1, v10, v1
+; CGP-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v5, v6
-; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v7, v0, vcc
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v7, v0
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v11
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v8, v6
+; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v9, v0, vcc
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v9, v0
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v11, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v1, v10
-; CGP-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v3, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v1, v2
+; CGP-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v0, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v11, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v11
+; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v0, v3, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v6, v10
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v11, v13, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
-; CGP-NEXT:    ; implicit-def: $vgpr5
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v13, v12, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v0, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB8_2: ; %Flow2
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_4
 ; CGP-NEXT:  ; %bb.3:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, 0, v2
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v1, v0
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_hi_u32 v0, v5, v0
-; CGP-NEXT:    v_mul_lo_u32 v0, v0, v10
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v5, v0
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v10
+; CGP-NEXT:    v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT:    v_mul_lo_u32 v0, v0, v2
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v8, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v10
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v0, v2
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:  BB8_4:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_or_b32_e32 v5, v3, v9
-; CGP-NEXT:    v_mov_b32_e32 v4, 0
-; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; CGP-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; CGP-NEXT:    v_or_b32_e32 v3, v7, v11
+; CGP-NEXT:    v_mov_b32_e32 v2, 0
+; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_6
 ; CGP-NEXT:  ; %bb.5:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v8
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v9
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, 0, v8
-; CGP-NEXT:    v_subb_u32_e32 v7, vcc, 0, v9, vcc
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v5, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v13, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v5, v11
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT:    v_mul_lo_u32 v15, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v11
+; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v10
+; CGP-NEXT:    v_subb_u32_e32 v6, vcc, 0, v11, vcc
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CGP-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT:    v_trunc_f32_e32 v3, v3
+; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v9
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v3, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v8
+; CGP-NEXT:    v_mul_lo_u32 v15, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v16, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v5, v10, vcc
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v10
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, v6, v4
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v11
-; CGP-NEXT:    v_mul_lo_u32 v13, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v10
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
-; CGP-NEXT:    v_mul_lo_u32 v7, v4, v6
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v6
-; CGP-NEXT:    v_mul_hi_u32 v15, v4, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v11, v6
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v13, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v3, v8, vcc
+; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v8, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v6, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v8
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v9, v8
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v6, v4
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_lo_u32 v6, v2, v4
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v9, v4
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v12, v8
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v5, v6, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v7, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v2, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v6
+; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v13
+; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v8, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_mul_lo_u32 v4, v7, v2
+; CGP-NEXT:    v_mul_hi_u32 v6, v5, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v3
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_mul_lo_u32 v5, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
-; CGP-NEXT:    v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v9
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_mul_lo_u32 v6, v10, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v11, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v10, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v3, v10, v3
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v5, v6
+; CGP-NEXT:    v_subb_u32_e64 v4, s[4:5], v7, v2, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v7, v2
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v2, v8
-; CGP-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v6, v8
-; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v9, v11, v10, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
-; CGP-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; CGP-NEXT:    ; implicit-def: $vgpr2
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v11, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v3, v10
+; CGP-NEXT:    v_subbrev_u32_e64 v7, s[4:5], 0, v2, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v11, vcc
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v6, v10
+; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v11
+; CGP-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; CGP-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v2, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v3, v6, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v4, v7, vcc
+; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
+; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  BB8_6: ; %Flow
 ; CGP-NEXT:    s_or_saveexec_b64 s[4:5], s[6:7]
 ; CGP-NEXT:    s_xor_b64 exec, exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz BB8_8
 ; CGP-NEXT:  ; %bb.7:
-; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, 0, v8
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v10
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v8
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v10
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v5, v2
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v2, v8
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v2, v3, vcc
-; CGP-NEXT:    v_mov_b32_e32 v5, 0
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:  BB8_8:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CGP-NEXT:    v_mov_b32_e32 v2, v4
-; CGP-NEXT:    v_mov_b32_e32 v3, v5
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = urem <2 x i64> %x, %shl.y

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 1d774f2354399..8db0bf4e4cd82 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -1809,125 +1809,125 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x
 ;
 ; GFX6-LABEL: sdiv_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b32 s16, 0x4f7ffffe
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s15, 0xf000
+; GFX6-NEXT:    s_mov_b32 s14, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s2, s12, 31
-; GFX6-NEXT:    s_add_i32 s3, s12, s2
-; GFX6-NEXT:    s_xor_b32 s12, s3, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX6-NEXT:    s_ashr_i32 s3, s13, 31
-; GFX6-NEXT:    s_add_i32 s0, s13, s3
-; GFX6-NEXT:    s_xor_b32 s13, s0, s3
+; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
+; GFX6-NEXT:    s_add_i32 s3, s8, s2
+; GFX6-NEXT:    s_xor_b32 s8, s3, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_ashr_i32 s3, s9, 31
+; GFX6-NEXT:    s_add_i32 s0, s9, s3
+; GFX6-NEXT:    s_xor_b32 s9, s0, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX6-NEXT:    s_sub_i32 s1, 0, s12
-; GFX6-NEXT:    s_ashr_i32 s0, s8, 31
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX6-NEXT:    s_sub_i32 s1, 0, s8
+; GFX6-NEXT:    s_ashr_i32 s0, s4, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s16, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
-; GFX6-NEXT:    s_add_i32 s1, s8, s0
+; GFX6-NEXT:    s_add_i32 s1, s4, s0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s16, v1
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s0, 0, s13
+; GFX6-NEXT:    s_sub_i32 s0, 0, s9
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s12
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s12, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX6-NEXT:    s_add_i32 s1, s9, s0
+; GFX6-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX6-NEXT:    s_add_i32 s1, s5, s0
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s3
-; GFX6-NEXT:    s_ashr_i32 s3, s14, 31
+; GFX6-NEXT:    s_ashr_i32 s3, s10, 31
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    s_add_i32 s0, s14, s3
-; GFX6-NEXT:    s_xor_b32 s9, s0, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX6-NEXT:    s_add_i32 s0, s10, s3
+; GFX6-NEXT:    s_xor_b32 s5, s0, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s5
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s13
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, s16, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s13, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s9
+; GFX6-NEXT:    s_sub_i32 s0, 0, s5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
-; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
-; GFX6-NEXT:    s_ashr_i32 s0, s10, 31
-; GFX6-NEXT:    s_add_i32 s8, s15, s2
-; GFX6-NEXT:    s_add_i32 s1, s10, s0
-; GFX6-NEXT:    s_xor_b32 s8, s8, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s8
+; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
+; GFX6-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX6-NEXT:    s_add_i32 s4, s11, s2
+; GFX6-NEXT:    s_add_i32 s1, s6, s0
+; GFX6-NEXT:    s_xor_b32 s4, s4, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX6-NEXT:    s_xor_b32 s3, s0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s9
+; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s5
 ; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s8
+; GFX6-NEXT:    s_sub_i32 s0, 0, s4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
-; GFX6-NEXT:    s_add_i32 s1, s11, s0
+; GFX6-NEXT:    s_ashr_i32 s0, s7, 31
+; GFX6-NEXT:    s_add_i32 s1, s7, s0
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s8
+; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
-; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v4i32:
@@ -10600,35 +10600,31 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
 ; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
+; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s6
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_u32 s2, s2, s4
 ; GFX9-NEXT:    s_mov_b32 s5, s4
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[4:5]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX9-NEXT:    s_sub_u32 s4, 0, s14
-; GFX9-NEXT:    s_subb_u32 s5, 0, s15
+; GFX9-NEXT:    s_sub_u32 s8, 0, s12
+; GFX9-NEXT:    s_subb_u32 s4, 0, s13
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s6, s9, 31
-; GFX9-NEXT:    s_mov_b32 s7, s6
-; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
-; GFX9-NEXT:    v_mul_lo_u32 v5, s5, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v0
+; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
+; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -10648,11 +10644,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v8, s5, v0
-; GFX9-NEXT:    v_mul_lo_u32 v9, s4, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v9, s8, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
 ; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
@@ -10668,65 +10664,69 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
-; GFX9-NEXT:    s_add_u32 s2, s8, s6
-; GFX9-NEXT:    s_addc_u32 s3, s9, s6
+; GFX9-NEXT:    s_add_u32 s2, s4, s8
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[6:7]
+; GFX9-NEXT:    s_addc_u32 s3, s5, s8
+; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v7, s9, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v1
+; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v1
+; GFX9-NEXT:    v_mul_lo_u32 v1, s15, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, s14, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s15, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s14, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s12, v1
+; GFX9-NEXT:    v_mul_hi_u32 v2, s12, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s12, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s8, v0
-; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s14, v0
+; GFX9-NEXT:    v_sub_u32_e32 v2, s15, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[2:3], s14, v0
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[2:3], s12, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, vcc, 0, v2, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v7
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s14, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s12, v4
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX9-NEXT:    v_subb_co_u32_e64 v1, vcc, v7, v1, s[0:1]
-; GFX9-NEXT:    s_ashr_i32 s0, s13, 31
-; GFX9-NEXT:    s_add_u32 s8, s12, s0
+; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX9-NEXT:    s_add_u32 s10, s10, s0
 ; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_addc_u32 s9, s13, s0
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s9
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v1
+; GFX9-NEXT:    s_addc_u32 s11, s11, s0
+; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[0:1]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s10
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s11
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v9, s16, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX9-NEXT:    v_rcp_f32_e32 v8, v9
@@ -10739,8 +10739,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mac_f32_e32 v3, s19, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    s_sub_u32 s2, 0, s8
-; GFX9-NEXT:    s_subb_u32 s3, 0, s9
+; GFX9-NEXT:    s_sub_u32 s2, 0, s10
+; GFX9-NEXT:    s_subb_u32 s3, 0, s11
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v3
@@ -10757,7 +10757,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
-; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
+; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
@@ -10788,60 +10788,60 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s10, s12
+; GFX9-NEXT:    s_add_u32 s0, s6, s12
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    s_addc_u32 s1, s11, s12
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[12:13]
+; GFX9-NEXT:    s_addc_u32 s1, s7, s12
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v2
-; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v10, s11, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
+; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
+; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, s11, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, s11, v2
-; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
+; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v3
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v2
-; GFX9-NEXT:    v_mov_b32_e32 v8, s6
+; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v2
+; GFX9-NEXT:    v_mov_b32_e32 v8, s8
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[0:1], s10, v2
-; GFX9-NEXT:    v_sub_u32_e32 v4, s11, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s8, v0
+; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[0:1], s6, v2
+; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[2:3], s8, v2
+; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[2:3], s10, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, vcc, 0, v4, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v8
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[2:3]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s10, v7
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
+; GFX9-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v8, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index cbb0424c28795..80fca55df019f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -157,65 +157,65 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xd
+; GFX7LESS-NEXT:    s_load_dword s8, s[0:1], 0xd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
-; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
+; GFX7LESS-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s14, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s12, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  BB1_2:
-; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: add_i32_uniform:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x34
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
 ; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
-; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB1_2
 ; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mul_i32 s1, s0, s1
+; GFX8-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX8-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s12, s6
 ; GFX8-NEXT:    s_mov_b32 s13, s7
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  BB1_2:
-; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
@@ -226,23 +226,23 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX9-LABEL: add_i32_uniform:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB1_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX9-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s12, s6
 ; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -250,7 +250,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX9-NEXT:  BB1_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -262,26 +262,26 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    s_load_dword s8, s[0:1], 0x34
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB1_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
-; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1064-NEXT:    s_mov_b32 s8, s6
-; GFX1064-NEXT:    s_mov_b32 s9, s7
+; GFX1064-NEXT:    s_mul_i32 s2, s8, s2
+; GFX1064-NEXT:    s_mov_b32 s14, -1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX1064-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
@@ -289,7 +289,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX1064-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
@@ -602,9 +602,9 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB3_2
 ; GFX7LESS-NEXT:  ; %bb.1:
@@ -615,10 +615,10 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s8, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s9, s3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
+; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  BB3_2:
@@ -626,11 +626,11 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
@@ -642,9 +642,9 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX89-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX89-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX89-NEXT:    s_cbranch_execz BB3_2
 ; GFX89-NEXT:  ; %bb.1:
@@ -655,20 +655,20 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX89-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s10, -1
 ; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    v_mov_b32_e32 v1, s2
-; GFX89-NEXT:    v_mov_b32_e32 v2, 0
+; GFX89-NEXT:    v_mov_b32_e32 v0, s2
+; GFX89-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
+; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    buffer_wbinvl1_vol
 ; GFX89-NEXT:  BB3_2:
 ; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX89-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX89-NEXT:    v_mov_b32_e32 v1, s2
-; GFX89-NEXT:    v_mov_b32_e32 v2, s3
-; GFX89-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
+; GFX89-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX89-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX89-NEXT:    v_mov_b32_e32 v0, s2
+; GFX89-NEXT:    v_mov_b32_e32 v1, s3
+; GFX89-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
 ; GFX89-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s2, -1
 ; GFX89-NEXT:    s_nop 2
@@ -679,25 +679,25 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB3_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s8, s2
 ; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
@@ -705,9 +705,9 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -717,24 +717,24 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB3_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s8, s2
 ; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
@@ -742,9 +742,9 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -762,9 +762,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s9, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB4_2
 ; GFX7LESS-NEXT:  ; %bb.1:
@@ -775,13 +775,13 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
 ; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
-; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc
+; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  BB4_2:
@@ -789,16 +789,16 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s1, v0
-; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s0, v0
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
-; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s1, v2
+; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v2
+; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s0, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -808,42 +808,42 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB4_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s12, s6
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX8-NEXT:    s_mov_b32 s13, s7
 ; GFX8-NEXT:    s_mul_i32 s7, s1, s6
 ; GFX8-NEXT:    s_mul_i32 s6, s0, s6
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
 ; GFX8-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s14, -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc
+; GFX8-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  BB4_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
-; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s1, v2
+; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v2
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v1
 ; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -853,9 +853,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB4_2
 ; GFX9-NEXT:  ; %bb.1:
@@ -869,23 +869,23 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX9-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s14, -1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc
+; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  BB4_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -898,10 +898,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB4_2
 ; GFX1064-NEXT:  ; %bb.1:
@@ -912,14 +912,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s8
 ; GFX1064-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1064-NEXT:    s_add_i32 s10, s10, s9
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s10
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s8, s6
 ; GFX1064-NEXT:    s_mov_b32 s9, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
@@ -927,15 +927,15 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1064-NEXT:    v_add_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT:    v_add_co_u32 v0, vcc, s0, v2
 ; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -946,9 +946,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB4_2
 ; GFX1032-NEXT:  ; %bb.1:
@@ -959,14 +959,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s1
 ; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
 ; GFX1032-NEXT:    s_add_i32 s9, s9, s8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s9
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_mov_b32 s8, s6
 ; GFX1032-NEXT:    s_mov_b32 s9, s7
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
@@ -974,15 +974,15 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v2
 ; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -1248,65 +1248,65 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xd
+; GFX7LESS-NEXT:    s_load_dword s8, s[0:1], 0xd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB7_2
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
-; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
+; GFX7LESS-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s14, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s12, s6
 ; GFX7LESS-NEXT:    s_mov_b32 s13, s7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  BB7_2:
-; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
-; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: sub_i32_uniform:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x34
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
 ; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
-; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB7_2
 ; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mul_i32 s1, s0, s1
+; GFX8-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX8-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s12, s6
 ; GFX8-NEXT:    s_mov_b32 s13, s7
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  BB7_2:
-; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
@@ -1317,23 +1317,23 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX9-LABEL: sub_i32_uniform:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB7_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_mul_i32 s2, s8, s2
 ; GFX9-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s12, s6
 ; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1341,7 +1341,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX9-NEXT:  BB7_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -1353,26 +1353,26 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x34
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    s_load_dword s8, s[0:1], 0x34
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB7_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
-; GFX1064-NEXT:    s_mov_b32 s10, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1064-NEXT:    s_mov_b32 s8, s6
-; GFX1064-NEXT:    s_mov_b32 s9, s7
+; GFX1064-NEXT:    s_mul_i32 s2, s8, s2
+; GFX1064-NEXT:    s_mov_b32 s14, -1
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
+; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
@@ -1380,7 +1380,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX1064-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
@@ -1693,9 +1693,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
 ; GFX7LESS-NEXT:  ; %bb.1:
@@ -1706,10 +1706,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s8, s2
 ; GFX7LESS-NEXT:    s_mov_b32 s9, s3
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
+; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  BB9_2:
@@ -1717,11 +1717,11 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -1733,9 +1733,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB9_2
 ; GFX8-NEXT:  ; %bb.1:
@@ -1746,18 +1746,18 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX8-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_mov_b32 s9, s3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
+; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  BB9_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
-; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1772,9 +1772,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB9_2
 ; GFX9-NEXT:  ; %bb.1:
@@ -1785,18 +1785,18 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX9-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_mov_b32 s9, s3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
+; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  BB9_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1810,25 +1810,25 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB9_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s8, s2
 ; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
@@ -1836,13 +1836,13 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -1851,24 +1851,24 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB9_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s8, s2
 ; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
@@ -1876,13 +1876,13 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -1899,9 +1899,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s9, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB10_2
 ; GFX7LESS-NEXT:  ; %bb.1:
@@ -1912,13 +1912,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
 ; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
-; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc
+; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  BB10_2:
@@ -1926,16 +1926,16 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s1, v0
-; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s0, v0
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
-; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
-; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s1, v2
+; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v2
+; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s0, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v2
+; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -1945,42 +1945,42 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; GFX8-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB10_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s12, s6
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX8-NEXT:    s_mov_b32 s13, s7
 ; GFX8-NEXT:    s_mul_i32 s7, s1, s6
 ; GFX8-NEXT:    s_mul_i32 s6, s0, s6
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
 ; GFX8-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s14, -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc
+; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  BB10_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
-; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s1, v2
+; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, s0, v2
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v1
 ; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1990,9 +1990,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB10_2
 ; GFX9-NEXT:  ; %bb.1:
@@ -2006,23 +2006,23 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX9-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX9-NEXT:    s_mov_b32 s15, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s14, -1
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc
+; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  BB10_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -2035,10 +2035,10 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB10_2
 ; GFX1064-NEXT:  ; %bb.1:
@@ -2049,14 +2049,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s8
 ; GFX1064-NEXT:    s_mul_i32 s8, s2, s8
 ; GFX1064-NEXT:    s_add_i32 s10, s10, s9
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s10
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_mov_b32 s8, s6
 ; GFX1064-NEXT:    s_mov_b32 s9, s7
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
@@ -2064,15 +2064,15 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
 ; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v2
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -2083,9 +2083,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB10_2
 ; GFX1032-NEXT:  ; %bb.1:
@@ -2096,14 +2096,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s1
 ; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
 ; GFX1032-NEXT:    s_add_i32 s9, s9, s8
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s9
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_mov_b32 s8, s6
 ; GFX1032-NEXT:    s_mov_b32 s9, s7
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
+; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
@@ -2111,15 +2111,15 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
 ; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v2
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index d3c2ef94c9238..dfacab48c45d3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -176,30 +176,30 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
+; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    ds_add_rtn_u32 v1, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:  BB1_2:
-; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -207,28 +207,28 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX8-LABEL: add_i32_uniform:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
 ; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
-; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB1_2
 ; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mul_i32 s1, s0, s1
+; GFX8-NEXT:    s_mul_i32 s2, s6, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_add_rtn_u32 v1, v1, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:  BB1_2:
-; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
@@ -239,27 +239,27 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX9-LABEL: add_i32_uniform:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB1_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_mul_i32 s2, s6, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_rtn_u32 v1, v1, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  BB1_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -271,20 +271,20 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB1_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_add_rtn_u32 v1, v1, v2
@@ -294,7 +294,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
@@ -735,27 +735,27 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB4_2
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
+; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:  BB4_2:
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -769,28 +769,28 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB4_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX8-NEXT:    s_mul_i32 s4, s4, 5
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
+; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:  BB4_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    s_nop 2
@@ -802,27 +802,27 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB4_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
+; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  BB4_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_nop 2
@@ -833,28 +833,28 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB4_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
+; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB4_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -865,27 +865,27 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB4_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v2, v[1:2]
+; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB4_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -905,9 +905,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB5_2
 ; GFX7LESS-NEXT:  ; %bb.1:
@@ -915,14 +915,14 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
-; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
+; GFX7LESS-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:  BB5_2:
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -931,15 +931,15 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s4, s0
 ; GFX7LESS-NEXT:    s_mov_b32 s5, s1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
+; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
+; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
+; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -948,41 +948,41 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB5_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    s_mul_i32 s7, s3, s6
 ; GFX8-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
+; GFX8-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:  BB5_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s3, v2
+; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v2
 ; GFX8-NEXT:    s_mov_b32 s5, s1
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v1
 ; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -991,9 +991,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB5_2
 ; GFX9-NEXT:  ; %bb.1:
@@ -1003,24 +1003,24 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
 ; GFX9-NEXT:    s_add_i32 s8, s8, s7
 ; GFX9-NEXT:    s_mul_i32 s6, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
+; GFX9-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  BB5_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
 ; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -1032,10 +1032,10 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB5_2
 ; GFX1064-NEXT:  ; %bb.1:
@@ -1046,25 +1046,25 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
 ; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1064-NEXT:    s_add_i32 s8, s8, s7
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
+; GFX1064-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB5_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1064-NEXT:    v_add_co_u32 v0, vcc, s2, v0
+; GFX1064-NEXT:    v_add_co_u32 v0, vcc, s2, v2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1074,9 +1074,9 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB5_2
 ; GFX1032-NEXT:  ; %bb.1:
@@ -1087,25 +1087,25 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
 ; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
 ; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
 ; GFX1032-NEXT:    s_add_i32 s7, s7, s6
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_add_rtn_u64 v[1:2], v3, v[1:2]
+; GFX1032-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB5_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, s2, v2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1347,30 +1347,30 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX7LESS:       ; %bb.0: ; %entry
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GFX7LESS-NEXT:    s_load_dword s6, s[0:1], 0xb
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
 ; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
-; GFX7LESS-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB8_2
 ; GFX7LESS-NEXT:  ; %bb.1:
-; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
+; GFX7LESS-NEXT:    s_mul_i32 s2, s6, s2
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    ds_sub_rtn_u32 v1, v1, v2
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:  BB8_2:
-; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
+; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX7LESS-NEXT:    s_mov_b32 s6, -1
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -1378,28 +1378,28 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX8-LABEL: sub_i32_uniform:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x2c
 ; GFX8-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
 ; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
-; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB8_2
 ; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mul_i32 s1, s0, s1
+; GFX8-NEXT:    s_mul_i32 s2, s6, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_sub_rtn_u32 v1, v1, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:  BB8_2:
-; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
@@ -1410,27 +1410,27 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX9-LABEL: sub_i32_uniform:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB8_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_mul_i32 s2, s6, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_sub_rtn_u32 v1, v1, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  BB8_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -1442,20 +1442,20 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
 ; GFX1064-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1064-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1064-NEXT:    s_load_dword s6, s[0:1], 0x2c
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB8_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s3, s[6:7]
+; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s3, s2, s3
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s3
+; GFX1064-NEXT:    s_mul_i32 s2, s6, s2
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1064-NEXT:    ds_sub_rtn_u32 v1, v1, v2
@@ -1465,7 +1465,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX1064-NEXT:    v_mul_lo_u32 v0, s6, v0
 ; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s6, -1
@@ -1906,27 +1906,27 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s4, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s5, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s5, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB11_2
 ; GFX7LESS-NEXT:  ; %bb.1:
 ; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX7LESS-NEXT:    s_mul_i32 s4, s4, 5
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
+; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:  BB11_2:
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -1940,27 +1940,27 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB11_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX8-NEXT:    s_mul_i32 s4, s4, 5
-; GFX8-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
+; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:  BB11_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -1974,26 +1974,26 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB11_2
 ; GFX9-NEXT:  ; %bb.1:
 ; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
 ; GFX9-NEXT:    s_mul_i32 s4, s4, 5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
+; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  BB11_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
@@ -2006,32 +2006,32 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s5, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB11_2
 ; GFX1064-NEXT:  ; %bb.1:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    s_mul_i32 s4, s4, 5
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
+; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB11_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v1
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
-; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
+; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2041,31 +2041,31 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB11_2
 ; GFX1032-NEXT:  ; %bb.1:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s3, s3
-; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    s_mul_i32 s3, s3, 5
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v2, v[1:2]
+; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB11_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v1
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
-; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
+; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2084,9 +2084,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
-; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
+; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz BB12_2
 ; GFX7LESS-NEXT:  ; %bb.1:
@@ -2094,14 +2094,14 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mul_i32 s7, s3, s6
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
+; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX7LESS-NEXT:    s_mul_i32 s6, s2, s6
-; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7LESS-NEXT:    s_mov_b32 m0, -1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
+; GFX7LESS-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:  BB12_2:
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
@@ -2110,15 +2110,15 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s4, s0
 ; GFX7LESS-NEXT:    s_mov_b32 s5, s1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s1
-; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s3, v2
+; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s2, v2
+; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s1
+; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v2
+; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -2127,41 +2127,41 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz BB12_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX8-NEXT:    s_mul_i32 s7, s3, s6
 ; GFX8-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
+; GFX8-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:  BB12_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, s0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX8-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX8-NEXT:    v_mul_lo_u32 v0, s2, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s3, v2
+; GFX8-NEXT:    v_mul_hi_u32 v3, s2, v2
 ; GFX8-NEXT:    s_mov_b32 s5, s1
-; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v2
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v1
 ; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2170,9 +2170,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz BB12_2
 ; GFX9-NEXT:  ; %bb.1:
@@ -2182,24 +2182,24 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
 ; GFX9-NEXT:    s_add_i32 s8, s8, s7
 ; GFX9-NEXT:    s_mul_i32 s6, s2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
+; GFX9-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:  BB12_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GFX9-NEXT:    s_mov_b32 s4, s0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v2
 ; GFX9-NEXT:    s_mov_b32 s5, s1
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -2211,10 +2211,10 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX1064-NEXT:    s_cbranch_execz BB12_2
 ; GFX1064-NEXT:  ; %bb.1:
@@ -2225,25 +2225,25 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1064-NEXT:    s_mul_hi_u32 s8, s2, s6
 ; GFX1064-NEXT:    s_mul_i32 s6, s2, s6
 ; GFX1064-NEXT:    s_add_i32 s8, s8, s7
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s8
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
+; GFX1064-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  BB12_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1064-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1064-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
+; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
 ; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -2253,9 +2253,9 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1032:       ; %bb.0: ; %entry
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
-; GFX1032-NEXT:    ; implicit-def: $vgpr1_vgpr2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz BB12_2
 ; GFX1032-NEXT:  ; %bb.1:
@@ -2266,25 +2266,25 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
 ; GFX1032-NEXT:    s_mul_hi_u32 s7, s2, s5
 ; GFX1032-NEXT:    s_mul_i32 s5, s2, s5
 ; GFX1032-NEXT:    s_add_i32 s7, s7, s6
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1032-NEXT:    v_mov_b32_e32 v2, s7
+; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_sub_rtn_u64 v[1:2], v3, v[1:2]
+; GFX1032-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  BB12_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX1032-NEXT:    v_mul_hi_u32 v4, s2, v2
+; GFX1032-NEXT:    v_mul_lo_u32 v2, s2, v2
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v4, v3
-; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
+; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
 ; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0

diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index ca714ea03ca97..bfa1b9502aaf9 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1520,11 +1520,11 @@ define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace
 ; VI-LABEL: ctpop_i16_in_br:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    s_load_dword s2, s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s0, 16
-; VI-NEXT:    v_cmp_ne_u16_e64 s[2:3], s1, 0
-; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; VI-NEXT:    s_lshr_b32 s0, s2, 16
+; VI-NEXT:    v_cmp_ne_u16_e64 s[0:1], s0, 0
+; VI-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; VI-NEXT:    s_cbranch_vccz BB14_2
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
@@ -1537,7 +1537,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace
 ; VI-NEXT:  BB14_2:
 ; VI-NEXT:    ; implicit-def: $vgpr0
 ; VI-NEXT:  BB14_3: ; %if
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_and_b32 s0, s2, 0xffff
 ; VI-NEXT:    s_bcnt1_i32_b32 s0, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0

diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index babb18f08576e..fbe5155933419 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -20,46 +20,46 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA:       ; %bb.0: ; %start
 ; ISA-NEXT:    v_readfirstlane_b32 s0, v0
 ; ISA-NEXT:    s_mov_b32 m0, s0
-; ISA-NEXT:    s_mov_b32 s0, 0
+; ISA-NEXT:    s_mov_b32 s8, 0
 ; ISA-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
 ; ISA-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
-; ISA-NEXT:    s_mov_b64 s[2:3], 0
-; ISA-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; ISA-NEXT:    s_mov_b64 s[0:1], 0
 ; ISA-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; ISA-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; ISA-NEXT:    s_branch BB0_3
 ; ISA-NEXT:  BB0_1: ; %Flow1
 ; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_or_b64 exec, exec, s[8:9]
-; ISA-NEXT:    s_add_i32 s0, s0, 1
-; ISA-NEXT:    s_mov_b64 s[8:9], 0
+; ISA-NEXT:    s_or_b64 exec, exec, s[6:7]
+; ISA-NEXT:    s_add_i32 s8, s8, 1
+; ISA-NEXT:    s_mov_b64 s[6:7], 0
 ; ISA-NEXT:  BB0_2: ; %Flow
 ; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_and_b64 s[10:11], exec, s[6:7]
-; ISA-NEXT:    s_or_b64 s[2:3], s[10:11], s[2:3]
-; ISA-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
-; ISA-NEXT:    s_and_b64 s[8:9], s[8:9], exec
-; ISA-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
-; ISA-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; ISA-NEXT:    s_and_b64 s[10:11], exec, s[4:5]
+; ISA-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
+; ISA-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; ISA-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; ISA-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; ISA-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; ISA-NEXT:    s_cbranch_execz BB0_6
 ; ISA-NEXT:  BB0_3: ; %loop
 ; ISA-NEXT:    ; =>This Inner Loop Header: Depth=1
-; ISA-NEXT:    s_or_b64 s[6:7], s[6:7], exec
-; ISA-NEXT:    s_cmp_lt_u32 s0, 32
-; ISA-NEXT:    s_mov_b64 s[8:9], -1
+; ISA-NEXT:    s_or_b64 s[4:5], s[4:5], exec
+; ISA-NEXT:    s_cmp_lt_u32 s8, 32
+; ISA-NEXT:    s_mov_b64 s[6:7], -1
 ; ISA-NEXT:    s_cbranch_scc0 BB0_2
 ; ISA-NEXT:  ; %bb.4: ; %endif1
 ; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_mov_b64 s[6:7], -1
-; ISA-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; ISA-NEXT:    s_mov_b64 s[4:5], -1
+; ISA-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; ISA-NEXT:    s_cbranch_execz BB0_1
 ; ISA-NEXT:  ; %bb.5: ; %endif2
 ; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_xor_b64 s[6:7], exec, -1
+; ISA-NEXT:    s_xor_b64 s[4:5], exec, -1
 ; ISA-NEXT:    s_branch BB0_1
 ; ISA-NEXT:  BB0_6: ; %Flow2
-; ISA-NEXT:    s_or_b64 exec, exec, s[2:3]
+; ISA-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; ISA-NEXT:    v_mov_b32_e32 v1, 0
-; ISA-NEXT:    s_and_saveexec_b64 s[0:1], s[4:5]
+; ISA-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
 ; ISA-NEXT:  ; %bb.7: ; %if1
 ; ISA-NEXT:    v_sqrt_f32_e32 v1, v0
 ; ISA-NEXT:  ; %bb.8: ; %endloop

diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
index a802249a6583b..408e00f1f37d9 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
@@ -10,18 +10,18 @@
 define i32 @fp_save_restore_in_temp_sgpr(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 4 %arg) #0 {
   ; GCN-LABEL: name: fp_save_restore_in_temp_sgpr
   ; GCN: bb.0.begin:
-  ; GCN:   liveins: $sgpr7, $sgpr30_sgpr31
-  ; GCN:   $sgpr7 = frame-setup COPY $sgpr33
+  ; GCN:   liveins: $sgpr11, $sgpr30_sgpr31
+  ; GCN:   $sgpr11 = frame-setup COPY $sgpr33
   ; GCN:   $sgpr33 = frame-setup COPY $sgpr32
   ; GCN: bb.1.lp_end:
-  ; GCN:   liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr30_sgpr31
   ; GCN: bb.2.lp_begin:
-  ; GCN:   liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31
+  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr30_sgpr31
   ; GCN: bb.3.Flow:
-  ; GCN:   liveins: $sgpr6, $sgpr7, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
+  ; GCN:   liveins: $sgpr10, $sgpr11, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr30_sgpr31
   ; GCN: bb.4.end:
-  ; GCN:   liveins: $sgpr7, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31
-  ; GCN:   $sgpr33 = frame-destroy COPY $sgpr7
+  ; GCN:   liveins: $sgpr11, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31
+  ; GCN:   $sgpr33 = frame-destroy COPY $sgpr11
 begin:
   br label %lp_begin
 

diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 785add35fc197..a7eaaf83f23a3 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -989,23 +989,23 @@ main_body:
 define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(double addrspace(3)* %ptr) #4 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX90A-NEXT:    s_mov_b64 s[2:3], 0
+; GFX90A-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX90A-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX90A-NEXT:  BB52_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_add_f64 v[2:3], v[0:1], 4.0
-; GFX90A-NEXT:    v_mov_b32_e32 v4, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
-; GFX90A-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX90A-NEXT:    s_cbranch_execnz BB52_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
index 88f467725dc56..b436992a0684d 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
@@ -14,22 +14,22 @@ define amdgpu_kernel void @test_move_load_address_to_vgpr(i32 addrspace(1)* noca
 ; GCN-LABEL: test_move_load_address_to_vgpr:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dword v1, v2, s[0:1] glc
+; GCN-NEXT:    global_load_dword v0, v1, s[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_add_u32_e32 v0, 0xffffff00, v1
-; GCN-NEXT:    v_lshlrev_b64 v[1:2], 2, v[1:2]
-; GCN-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
-; GCN-NEXT:    v_addc_co_u32_e32 v2, vcc, v3, v2, vcc
+; GCN-NEXT:    v_add_u32_e32 v2, 0xffffff00, v0
+; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-NEXT:  BB0_1: ; %bb3
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    global_load_dword v3, v[1:2], off glc
+; GCN-NEXT:    global_load_dword v3, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
-; GCN-NEXT:    v_add_co_u32_e64 v1, s[0:1], 4, v1
-; GCN-NEXT:    v_addc_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v2
+; GCN-NEXT:    v_add_co_u32_e64 v0, s[0:1], 4, v0
+; GCN-NEXT:    v_addc_co_u32_e64 v1, s[0:1], 0, v1, s[0:1]
 ; GCN-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-NEXT:    s_cbranch_vccz BB0_1
 ; GCN-NEXT:  ; %bb.2: ; %bb2

diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
index af1f574733c5a..87cc6b50aca9c 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
@@ -1,6 +1,13 @@
-#  FIXME: The allocator emits an error on allocation failure, but it also produces verifier errors
-# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,0 -stop-after=greedy,0 -o - 2>&1 %s | FileCheck %s
-# CHECK: error: ran out of registers during register allocation
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,0 -stop-after=virtregrewriter,0 -o - %s | FileCheck %s
+
+# This testcase used to fail due to introducing a spill of a 1024-bit
+# SGPR tuple for every subregister use inside the loop. With
+# overlapping unspillable split ranges, the allocator was unable to
+# allocate one of the tuples. We avoid this by ensuring wide tuples are
+# always allocated first (although the allocator should probably have
+# been smart enough to handle this without that hint; ideally it would
+# understand that only single subregisters need to be spilled and
+# restored at a time).
 
 ---
 name:            greedy_fail_alloc_sgpr1024_spill
@@ -16,6 +23,101 @@ machineFunctionInfo:
   stackPtrOffsetReg: '$sgpr32'
   occupancy:       6
 body:             |
+  ; CHECK-LABEL: name: greedy_fail_alloc_sgpr1024_spill
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x80000000)
+  ; CHECK:   liveins: $sgpr14, $sgpr15, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK:   renamable $sgpr34_sgpr35 = COPY $sgpr8_sgpr9
+  ; CHECK:   renamable $sgpr33 = COPY $sgpr15
+  ; CHECK:   renamable $sgpr42 = COPY $sgpr14
+  ; CHECK:   renamable $sgpr36_sgpr37 = COPY $sgpr10_sgpr11
+  ; CHECK:   renamable $sgpr38_sgpr39 = COPY $sgpr6_sgpr7
+  ; CHECK:   renamable $sgpr40_sgpr41 = COPY $sgpr4_sgpr5
+  ; CHECK:   renamable $sgpr66_sgpr67 = S_LOAD_DWORDX2_IMM renamable $sgpr34_sgpr35, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+  ; CHECK:   renamable $sgpr44 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr45 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr46 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr47 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr48 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr49 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr50 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr51 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr52 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr53 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr54 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr55 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr56 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr57 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr58 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr59 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr60 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr61 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr62 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr63 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr64 = S_MOV_B32 0
+  ; CHECK:   renamable $sgpr68_sgpr69 = IMPLICIT_DEF
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK:   dead $sgpr30_sgpr31 = SI_CALL renamable $sgpr68_sgpr69, 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK:   $sgpr4_sgpr5 = COPY killed renamable $sgpr40_sgpr41
+  ; CHECK:   $sgpr6_sgpr7 = COPY killed renamable $sgpr38_sgpr39
+  ; CHECK:   $sgpr8_sgpr9 = COPY killed renamable $sgpr34_sgpr35
+  ; CHECK:   $sgpr10_sgpr11 = COPY killed renamable $sgpr36_sgpr37
+  ; CHECK:   $sgpr12 = COPY killed renamable $sgpr42
+  ; CHECK:   $sgpr13 = COPY killed renamable $sgpr33
+  ; CHECK:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr68_sgpr69, 0, csr_amdgpu_highregs, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK:   renamable $sgpr4_sgpr5 = COPY $exec, implicit-def $exec
+  ; CHECK:   dead renamable $sgpr6_sgpr7 = IMPLICIT_DEF
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; CHECK:   liveins: $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF, $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F
+  ; CHECK:   renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec
+  ; CHECK:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; CHECK: bb.2:
+  ; CHECK:   successors: %bb.3(0x80000000)
+  ; CHECK:   liveins: $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF, $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F
+  ; CHECK:   [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
+  ; CHECK:   renamable $sgpr6 = S_LSHL_B32 renamable $sgpr67, 1, implicit-def dead $scc
+  ; CHECK:   dead [[COPY]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec
+  ; CHECK: bb.3:
+  ; CHECK:   successors: %bb.5(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF, $sgpr4_sgpr5, $sgpr66_sgpr67:0x000000000000000F
+  ; CHECK:   renamable $sgpr6_sgpr7 = S_OR_SAVEEXEC_B64 renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK:   renamable $sgpr68 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr69 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr70 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr71 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr72 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr73 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr74 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr75 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr76 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr77 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr78 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr79 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr80 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr81 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr82 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr83 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr84 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr85 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr86 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr87 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr88 = COPY renamable $sgpr44
+  ; CHECK:   renamable $sgpr89 = COPY renamable $sgpr44
+  ; CHECK:   dead %18:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit $exec
+  ; CHECK:   $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc
+  ; CHECK:   S_CBRANCH_EXECZ %bb.5, implicit $exec
+  ; CHECK:   S_BRANCH %bb.1
+  ; CHECK: bb.4:
+  ; CHECK:   successors: %bb.5(0x80000000)
+  ; CHECK:   liveins: $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75:0x000003FFFFFFFFFF, $sgpr6_sgpr7, $sgpr66_sgpr67:0x0000000000000003
+  ; CHECK:   $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc
+  ; CHECK:   dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr66, 1, implicit-def dead $scc
+  ; CHECK:   dead %16:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75
+  ; CHECK: bb.5:
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15
 

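The new comment in greedy-alloc-fail-sgpr1024-spill.mir above describes the
workaround; mechanically, it comes down to the target opting in to applying
the register classes' AllocationPriority to global live ranges, so the
unspillable wide tuples are assigned registers first. A minimal sketch of
that opt-in follows; the hook name addAllocPriorityToGlobalRanges() is an
assumption based on the target option this change enables, not a verbatim
copy of the SIRegisterInfo.h hunk:

// Sketch only, under the assumption named above; consult the actual
// SIRegisterInfo.h diff for the authoritative signature.
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
public:
  // Wide SGPR/VGPR tuple classes carry a higher AllocationPriority in
  // the .td register definitions. Honoring that priority for global
  // (not just local) live ranges makes the greedy allocator assign the
  // unspillable 1024-bit tuples before the small, easily spillable
  // ranges that would otherwise fragment the register file.
  bool addAllocPriorityToGlobalRanges() const override { return true; }
};

This matches the caveat in the testcase comment: ideally the allocator would
understand that only single subregisters need to be spilled and restored at
a time, which would make the priority hint unnecessary.
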
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index 9e6bd99f99c65..e146d13d8eb40 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -4,47 +4,47 @@
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 ; SI-LABEL: i1_copy_from_loop:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s14, 0
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr8_sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr10_sgpr11
 ; SI-NEXT:    s_branch BB0_3
 ; SI-NEXT:  BB0_1: ; %Flow1
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; SI-NEXT:  BB0_2: ; %Flow
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    s_and_b64 s[14:15], exec, s[10:11]
-; SI-NEXT:    s_or_b64 s[4:5], s[14:15], s[4:5]
-; SI-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
-; SI-NEXT:    s_and_b64 s[12:13], s[12:13], exec
-; SI-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; SI-NEXT:    s_and_b64 s[12:13], exec, s[8:9]
+; SI-NEXT:    s_or_b64 s[4:5], s[12:13], s[4:5]
+; SI-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
+; SI-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; SI-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz BB0_6
 ; SI-NEXT:  BB0_3: ; %for.body
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    s_or_b64 s[10:11], s[10:11], exec
-; SI-NEXT:    s_cmp_gt_u32 s6, 3
-; SI-NEXT:    v_cmp_lt_u32_e64 s[12:13], s6, 4
+; SI-NEXT:    s_or_b64 s[8:9], s[8:9], exec
+; SI-NEXT:    s_cmp_gt_u32 s14, 3
+; SI-NEXT:    v_cmp_lt_u32_e64 s[10:11], s14, 4
 ; SI-NEXT:    s_cbranch_scc1 BB0_2
 ; SI-NEXT:  ; %bb.4: ; %mid.loop
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s14
 ; SI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen
-; SI-NEXT:    s_mov_b64 s[12:13], -1
+; SI-NEXT:    s_mov_b64 s[10:11], -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
-; SI-NEXT:    s_mov_b64 s[10:11], -1
-; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_mov_b64 s[8:9], -1
+; SI-NEXT:    s_and_saveexec_b64 s[12:13], vcc
 ; SI-NEXT:    s_cbranch_execz BB0_1
 ; SI-NEXT:  ; %bb.5: ; %end.loop
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    s_add_i32 s6, s6, 1
-; SI-NEXT:    s_xor_b64 s[10:11], exec, -1
+; SI-NEXT:    s_add_i32 s14, s14, 1
+; SI-NEXT:    s_xor_b64 s[8:9], exec, -1
 ; SI-NEXT:    s_branch BB0_1
 ; SI-NEXT:  BB0_6: ; %for.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[8:9]
+; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[6:7]
 ; SI-NEXT:    s_cbranch_execz BB0_8
 ; SI-NEXT:  ; %bb.7: ; %if
 ; SI-NEXT:    exp mrt0 v0, v0, v0, v0 done vm

diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 6f430fa4c2af8..76cfcb726d6ff 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -5,85 +5,85 @@
 define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
 ; GFX9-LABEL: udiv32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_sub_i32 s3, 0, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    s_sub_i32 s4, 0, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  BB0_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s3, v0
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s4, v2
 ; GFX9-NEXT:    v_not_b32_e32 v5, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, s5, v5
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, s2, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, s4, v5
+; GFX9-NEXT:    v_add_u32_e32 v4, s2, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    s_add_u32 s4, s4, 1
+; GFX9-NEXT:    s_add_u32 s2, s2, 1
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    s_addc_u32 s5, s5, 0
+; GFX9-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX9-NEXT:    s_cmpk_eq_i32 s2, 0x400
 ; GFX9-NEXT:    s_cbranch_scc0 BB0_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udiv32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT:    s_sub_i32 s3, 0, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, s5, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:  BB0_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s5, v0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
+; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, s3, v2
+; GFX10-NEXT:    v_mul_lo_u32 v4, s5, v2
 ; GFX10-NEXT:    v_not_b32_e32 v3, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, s2, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, s4, v4
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, s4, v3
-; GFX10-NEXT:    s_add_u32 s4, s4, 1
-; GFX10-NEXT:    s_addc_u32 s5, s5, 0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, s2, v3
+; GFX10-NEXT:    s_add_u32 s2, s2, 1
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v3
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX10-NEXT:    s_cmpk_eq_i32 s2, 0x400
 ; GFX10-NEXT:    s_cbranch_scc0 BB0_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -107,81 +107,81 @@ bb3:                                              ; preds = %bb3, %bb
 define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
 ; GFX9-LABEL: urem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_sub_i32 s3, 0, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  BB1_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s3, v0
+; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v2
-; GFX9-NEXT:    v_add_u32_e32 v4, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v2
+; GFX9-NEXT:    v_add_u32_e32 v4, s2, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX9-NEXT:    v_add_u32_e32 v2, s4, v2
-; GFX9-NEXT:    s_add_u32 s4, s4, 1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, s2, v2
+; GFX9-NEXT:    s_add_u32 s2, s2, 1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    s_addc_u32 s5, s5, 0
+; GFX9-NEXT:    s_addc_u32 s3, s3, 0
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX9-NEXT:    s_cmpk_eq_i32 s2, 0x400
 ; GFX9-NEXT:    s_cbranch_scc0 BB1_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: urem32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX10-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX10-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT:    s_sub_i32 s3, 0, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, s5, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:  BB1_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s5, v0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
+; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v2
 ; GFX10-NEXT:    v_not_b32_e32 v2, v2
-; GFX10-NEXT:    v_mul_lo_u32 v2, s2, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, s4, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v4
+; GFX10-NEXT:    v_mul_lo_u32 v2, s4, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, s2, v3
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, s4, v2
-; GFX10-NEXT:    s_add_u32 s4, s4, 1
-; GFX10-NEXT:    s_addc_u32 s5, s5, 0
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s2, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, s2, v2
+; GFX10-NEXT:    s_add_u32 s2, s2, 1
+; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s4, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
-; GFX10-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX10-NEXT:    s_cmpk_eq_i32 s2, 0x400
 ; GFX10-NEXT:    s_cbranch_scc0 BB1_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -211,37 +211,37 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s2
-; GFX9-NEXT:    s_xor_b32 s3, s3, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
+; GFX9-NEXT:    s_xor_b32 s4, s3, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_sub_i32 s3, 0, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT:    s_mov_b32 s3, 0
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  BB2_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_hi_u32 v2, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v2, s3
+; GFX9-NEXT:    v_mul_hi_u32 v2, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, v2, s4
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    v_sub_u32_e32 v3, s3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s3, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
-; GFX9-NEXT:    s_add_i32 s4, s4, 1
+; GFX9-NEXT:    s_add_i32 s3, s3, 1
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x400
 ; GFX9-NEXT:    s_cbranch_scc0 BB2_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -407,34 +407,35 @@ bb3:                                              ; preds = %bb3, %bb
 define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_movk_i32 s5, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s3, s2, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_movk_i32 s3, 0x400
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX9-NEXT:    s_and_b32 s2, s4, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX9-NEXT:  BB4_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v2, s2, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v2
-; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v7, s5
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
-; GFX9-NEXT:    v_mul_f32_e32 v2, v8, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, s3
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s2, v5
+; GFX9-NEXT:    v_mul_f32_e32 v0, v8, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT:    v_mad_f32 v2, -v2, v0, v8
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, v0
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s3, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT:    v_mad_f32 v0, -v0, v2, v8
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v0|, v2
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    global_store_short v[5:6], v2, off
+; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz BB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -445,29 +446,29 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_mov_b32 s1, 0xffff
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s0, s1, s4
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX10-NEXT:  BB4_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v2, s1, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, s1, v4
 ; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v2
-; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
-; GFX10-NEXT:    v_mul_f32_e32 v2, v7, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v7, v3
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX10-NEXT:    v_mad_f32 v7, -v2, v0, v7
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v7|, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s0, 0, v2, s0
-; GFX10-NEXT:    global_store_short v[5:6], v2, off
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_mad_f32 v7, -v0, v2, v7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v7|, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v0, s0, 0, v0, s0
+; GFX10-NEXT:    global_store_short v[5:6], v0, off
 ; GFX10-NEXT:    s_cbranch_vccz BB4_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -492,35 +493,35 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-LABEL: urem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_mov_b32 s6, 0xffff
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_movk_i32 s8, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_and_b32 s5, s4, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    s_and_b32 s7, s6, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX9-NEXT:  BB5_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v2
-; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
+; GFX9-NEXT:    v_and_b32_e32 v0, s6, v4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
 ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s8, v4
-; GFX9-NEXT:    v_mul_f32_e32 v9, v8, v1
+; GFX9-NEXT:    v_mul_f32_e32 v9, v8, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v10, v9
-; GFX9-NEXT:    v_mad_f32 v8, -v9, v0, v8
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v8|, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s7
+; GFX9-NEXT:    v_mad_f32 v8, -v9, v2, v8
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v8|, v2
+; GFX9-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v8, v8, s5
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s6, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v8, s7
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v8
-; GFX9-NEXT:    global_store_short v[5:6], v2, off
+; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v8
+; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz BB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -531,31 +532,31 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX10-NEXT:    s_mov_b32 s1, 0xffff
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s4, s1, s4
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX10-NEXT:  BB5_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v2, s1, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, s1, v4
 ; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v2
-; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX10-NEXT:    v_mul_f32_e32 v8, v7, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v7, v0
+; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
+; GFX10-NEXT:    v_mul_f32_e32 v8, v7, v3
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
 ; GFX10-NEXT:    v_trunc_f32_e32 v8, v8
-; GFX10-NEXT:    v_mad_f32 v7, -v8, v0, v7
+; GFX10-NEXT:    v_mad_f32 v7, -v8, v2, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v7|, v0
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v7|, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v7, s4
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v7
-; GFX10-NEXT:    global_store_short v[5:6], v2, off
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v7
+; GFX10-NEXT:    global_store_short v[5:6], v0, off
 ; GFX10-NEXT:    s_cbranch_vccz BB5_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -580,37 +581,38 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-LABEL: sdiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_movk_i32 s3, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_movk_i32 s5, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s2, s2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX9-NEXT:    s_sext_i32_i16 s4, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX9-NEXT:  BB6_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_bfe_i32 v5, v4, 0, 16
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v5
-; GFX9-NEXT:    v_xor_b32_e32 v8, s2, v5
-; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v7, s5
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
+; GFX9-NEXT:    v_xor_b32_e32 v8, s4, v5
+; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v7, s3
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
-; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v1
+; GFX9-NEXT:    v_mul_f32_e32 v7, v9, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v8
+; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v8
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v8, v7
-; GFX9-NEXT:    v_mad_f32 v7, -v7, v0, v9
+; GFX9-NEXT:    v_mad_f32 v7, -v7, v2, v9
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v7|, |v0|
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s3, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
+; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v7|, |v2|
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v0, v8, v0
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    global_store_short v[5:6], v2, off
+; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz BB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -620,34 +622,34 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s4
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX10-NEXT:  BB6_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_bfe_i32 v5, v4, 0, 16
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v7, v5
 ; GFX10-NEXT:    v_xor_b32_e32 v8, s4, v5
-; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
-; GFX10-NEXT:    v_mul_f32_e32 v2, v7, v1
+; GFX10-NEXT:    v_mul_f32_e32 v0, v7, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v8, 30, v8
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
 ; GFX10-NEXT:    v_or_b32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
-; GFX10-NEXT:    v_mad_f32 v7, -v2, v0, v7
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s1, |v7|, |v0|
+; GFX10-NEXT:    v_mad_f32 v7, -v0, v2, v7
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s1, |v7|, |v2|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, v8, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v7
-; GFX10-NEXT:    global_store_short v[5:6], v2, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v7
+; GFX10-NEXT:    global_store_short v[5:6], v0, off
 ; GFX10-NEXT:    s_cbranch_vccz BB6_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
@@ -672,39 +674,39 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX9-LABEL: srem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_movk_i32 s5, 0x400
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_movk_i32 s7, 0x400
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s4, s2
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX9-NEXT:    s_sext_i32_i16 s6, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s6
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX9-NEXT:  BB7_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    v_bfe_i32 v7, v4, 0, 16
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v10, v7
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX9-NEXT:    v_xor_b32_e32 v9, s4, v7
-; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v9
-; GFX9-NEXT:    v_mul_f32_e32 v9, v10, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-NEXT:    v_xor_b32_e32 v9, s6, v7
+; GFX9-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
+; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v9
+; GFX9-NEXT:    v_mul_f32_e32 v9, v10, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v9, v9
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v11, v9
-; GFX9-NEXT:    v_mad_f32 v9, -v9, v0, v10
-; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v9|, |v0|
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[2:3]
-; GFX9-NEXT:    v_add_u32_e32 v2, v11, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
+; GFX9-NEXT:    v_mad_f32 v9, -v9, v2, v10
+; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v9|, |v2|
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[2:3]
+; GFX9-NEXT:    v_add_u32_e32 v0, v11, v0
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX9-NEXT:    v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s5, v4
-; GFX9-NEXT:    v_mov_b32_e32 v8, s7
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s6, v5
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s7, v4
+; GFX9-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], s4, v5
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v2, v7, v2
+; GFX9-NEXT:    v_sub_u32_e32 v0, v7, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
-; GFX9-NEXT:    global_store_short v[5:6], v2, off
+; GFX9-NEXT:    global_store_short v[5:6], v0, off
 ; GFX9-NEXT:    s_cbranch_vccz BB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
@@ -714,36 +716,36 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_sext_i32_i16 s1, s4
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, s1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v2
 ; GFX10-NEXT:  BB7_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_bfe_i32 v7, v4, 0, 16
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v4
 ; GFX10-NEXT:    v_add_nc_u16 v4, v4, 1
 ; GFX10-NEXT:    v_cvt_f32_i32_e32 v5, v7
 ; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v7
-; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v1
+; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v3
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
 ; GFX10-NEXT:    v_trunc_f32_e32 v8, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_mad_f32 v5, -v8, v0, v5
+; GFX10-NEXT:    v_mad_f32 v5, -v8, v2, v5
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v8, v8
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v0|
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v5|, |v2|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[2:3]
+; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 1, v[0:1]
 ; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v8, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v9
 ; GFX10-NEXT:    v_add_co_u32 v5, s0, s2, v5
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
-; GFX10-NEXT:    v_mul_lo_u32 v2, v2, s1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v7, v2
-; GFX10-NEXT:    global_store_short v[5:6], v2, off
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, s1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v7, v0
+; GFX10-NEXT:    global_store_short v[5:6], v0, off
 ; GFX10-NEXT:    s_cbranch_vccz BB7_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 8f584eced4c35..cdda3490a9307 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -210,40 +210,40 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
-; GCN-NEXT:    v_writelane_b32 v40, s38, 3
-; GCN-NEXT:    v_writelane_b32 v40, s39, 4
-; GCN-NEXT:    v_writelane_b32 v40, s40, 5
-; GCN-NEXT:    v_writelane_b32 v40, s41, 6
-; GCN-NEXT:    v_writelane_b32 v40, s42, 7
-; GCN-NEXT:    v_writelane_b32 v40, s43, 8
-; GCN-NEXT:    v_writelane_b32 v40, s44, 9
-; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
 ; GCN-NEXT:    v_writelane_b32 v40, s46, 11
 ; GCN-NEXT:    v_writelane_b32 v40, s47, 12
 ; GCN-NEXT:    v_writelane_b32 v40, s48, 13
 ; GCN-NEXT:    v_writelane_b32 v40, s49, 14
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 15
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 16
-; GCN-NEXT:    s_mov_b32 s34, s14
-; GCN-NEXT:    s_mov_b32 s35, s13
-; GCN-NEXT:    s_mov_b32 s36, s12
-; GCN-NEXT:    s_mov_b64 s[38:39], s[10:11]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
-; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
+; GCN-NEXT:    s_mov_b32 s42, s14
+; GCN-NEXT:    s_mov_b32 s43, s13
+; GCN-NEXT:    s_mov_b32 s44, s12
+; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[46:47], exec
 ; GCN-NEXT:  BB2_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s17, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
-; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
-; GCN-NEXT:    s_mov_b64 s[8:9], s[40:41]
-; GCN-NEXT:    s_mov_b64 s[10:11], s[38:39]
-; GCN-NEXT:    s_mov_b32 s12, s36
-; GCN-NEXT:    s_mov_b32 s13, s35
-; GCN-NEXT:    s_mov_b32 s14, s34
+; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GCN-NEXT:    s_mov_b32 s12, s44
+; GCN-NEXT:    s_mov_b32 s13, s43
+; GCN-NEXT:    s_mov_b32 s14, s42
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    ; implicit-def: $vgpr31
@@ -257,14 +257,14 @@ define void @test_indirect_call_vgpr_ptr(void()* %fptr) {
 ; GCN-NEXT:    v_readlane_b32 s48, v40, 13
 ; GCN-NEXT:    v_readlane_b32 s47, v40, 12
 ; GCN-NEXT:    v_readlane_b32 s46, v40, 11
-; GCN-NEXT:    v_readlane_b32 s45, v40, 10
-; GCN-NEXT:    v_readlane_b32 s44, v40, 9
-; GCN-NEXT:    v_readlane_b32 s43, v40, 8
-; GCN-NEXT:    v_readlane_b32 s42, v40, 7
-; GCN-NEXT:    v_readlane_b32 s41, v40, 6
-; GCN-NEXT:    v_readlane_b32 s40, v40, 5
-; GCN-NEXT:    v_readlane_b32 s39, v40, 4
-; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0
@@ -292,27 +292,27 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
-; GCN-NEXT:    v_writelane_b32 v40, s38, 3
-; GCN-NEXT:    v_writelane_b32 v40, s39, 4
-; GCN-NEXT:    v_writelane_b32 v40, s40, 5
-; GCN-NEXT:    v_writelane_b32 v40, s41, 6
-; GCN-NEXT:    v_writelane_b32 v40, s42, 7
-; GCN-NEXT:    v_writelane_b32 v40, s43, 8
-; GCN-NEXT:    v_writelane_b32 v40, s44, 9
-; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
 ; GCN-NEXT:    v_writelane_b32 v40, s46, 11
 ; GCN-NEXT:    v_writelane_b32 v40, s47, 12
 ; GCN-NEXT:    v_writelane_b32 v40, s48, 13
 ; GCN-NEXT:    v_writelane_b32 v40, s49, 14
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 15
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 16
-; GCN-NEXT:    s_mov_b32 s34, s14
-; GCN-NEXT:    s_mov_b32 s35, s13
-; GCN-NEXT:    s_mov_b32 s36, s12
-; GCN-NEXT:    s_mov_b64 s[38:39], s[10:11]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
-; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
+; GCN-NEXT:    s_mov_b32 s42, s14
+; GCN-NEXT:    s_mov_b32 s43, s13
+; GCN-NEXT:    s_mov_b32 s44, s12
+; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[46:47], exec
 ; GCN-NEXT:  BB3_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
@@ -320,13 +320,13 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
-; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
-; GCN-NEXT:    s_mov_b64 s[8:9], s[40:41]
-; GCN-NEXT:    s_mov_b64 s[10:11], s[38:39]
-; GCN-NEXT:    s_mov_b32 s12, s36
-; GCN-NEXT:    s_mov_b32 s13, s35
-; GCN-NEXT:    s_mov_b32 s14, s34
+; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GCN-NEXT:    s_mov_b32 s12, s44
+; GCN-NEXT:    s_mov_b32 s13, s43
+; GCN-NEXT:    s_mov_b32 s14, s42
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    ; implicit-def: $vgpr31
@@ -340,14 +340,14 @@ define void @test_indirect_call_vgpr_ptr_arg(void(i32)* %fptr) {
 ; GCN-NEXT:    v_readlane_b32 s48, v40, 13
 ; GCN-NEXT:    v_readlane_b32 s47, v40, 12
 ; GCN-NEXT:    v_readlane_b32 s46, v40, 11
-; GCN-NEXT:    v_readlane_b32 s45, v40, 10
-; GCN-NEXT:    v_readlane_b32 s44, v40, 9
-; GCN-NEXT:    v_readlane_b32 s43, v40, 8
-; GCN-NEXT:    v_readlane_b32 s42, v40, 7
-; GCN-NEXT:    v_readlane_b32 s41, v40, 6
-; GCN-NEXT:    v_readlane_b32 s40, v40, 5
-; GCN-NEXT:    v_readlane_b32 s39, v40, 4
-; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0
@@ -375,40 +375,40 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
-; GCN-NEXT:    v_writelane_b32 v40, s38, 3
-; GCN-NEXT:    v_writelane_b32 v40, s39, 4
-; GCN-NEXT:    v_writelane_b32 v40, s40, 5
-; GCN-NEXT:    v_writelane_b32 v40, s41, 6
-; GCN-NEXT:    v_writelane_b32 v40, s42, 7
-; GCN-NEXT:    v_writelane_b32 v40, s43, 8
-; GCN-NEXT:    v_writelane_b32 v40, s44, 9
-; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
 ; GCN-NEXT:    v_writelane_b32 v40, s46, 11
 ; GCN-NEXT:    v_writelane_b32 v40, s47, 12
 ; GCN-NEXT:    v_writelane_b32 v40, s48, 13
 ; GCN-NEXT:    v_writelane_b32 v40, s49, 14
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 15
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 16
-; GCN-NEXT:    s_mov_b32 s34, s14
-; GCN-NEXT:    s_mov_b32 s35, s13
-; GCN-NEXT:    s_mov_b32 s36, s12
-; GCN-NEXT:    s_mov_b64 s[38:39], s[10:11]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
-; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
+; GCN-NEXT:    s_mov_b32 s42, s14
+; GCN-NEXT:    s_mov_b32 s43, s13
+; GCN-NEXT:    s_mov_b32 s44, s12
+; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GCN-NEXT:    s_mov_b64 s[46:47], exec
 ; GCN-NEXT:  BB4_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s16, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s17, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[48:49], vcc
-; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
-; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
-; GCN-NEXT:    s_mov_b64 s[8:9], s[40:41]
-; GCN-NEXT:    s_mov_b64 s[10:11], s[38:39]
-; GCN-NEXT:    s_mov_b32 s12, s36
-; GCN-NEXT:    s_mov_b32 s13, s35
-; GCN-NEXT:    s_mov_b32 s14, s34
+; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GCN-NEXT:    s_mov_b32 s12, s44
+; GCN-NEXT:    s_mov_b32 s13, s43
+; GCN-NEXT:    s_mov_b32 s14, s42
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -424,14 +424,14 @@ define i32 @test_indirect_call_vgpr_ptr_ret(i32()* %fptr) {
 ; GCN-NEXT:    v_readlane_b32 s48, v40, 13
 ; GCN-NEXT:    v_readlane_b32 s47, v40, 12
 ; GCN-NEXT:    v_readlane_b32 s46, v40, 11
-; GCN-NEXT:    v_readlane_b32 s45, v40, 10
-; GCN-NEXT:    v_readlane_b32 s44, v40, 9
-; GCN-NEXT:    v_readlane_b32 s43, v40, 8
-; GCN-NEXT:    v_readlane_b32 s42, v40, 7
-; GCN-NEXT:    v_readlane_b32 s41, v40, 6
-; GCN-NEXT:    v_readlane_b32 s40, v40, 5
-; GCN-NEXT:    v_readlane_b32 s39, v40, 4
-; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0
@@ -460,27 +460,27 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    v_writelane_b32 v40, s34, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s35, 1
 ; GCN-NEXT:    v_writelane_b32 v40, s36, 2
-; GCN-NEXT:    v_writelane_b32 v40, s38, 3
-; GCN-NEXT:    v_writelane_b32 v40, s39, 4
-; GCN-NEXT:    v_writelane_b32 v40, s40, 5
-; GCN-NEXT:    v_writelane_b32 v40, s41, 6
-; GCN-NEXT:    v_writelane_b32 v40, s42, 7
-; GCN-NEXT:    v_writelane_b32 v40, s43, 8
-; GCN-NEXT:    v_writelane_b32 v40, s44, 9
-; GCN-NEXT:    v_writelane_b32 v40, s45, 10
+; GCN-NEXT:    v_writelane_b32 v40, s37, 3
+; GCN-NEXT:    v_writelane_b32 v40, s38, 4
+; GCN-NEXT:    v_writelane_b32 v40, s39, 5
+; GCN-NEXT:    v_writelane_b32 v40, s40, 6
+; GCN-NEXT:    v_writelane_b32 v40, s41, 7
+; GCN-NEXT:    v_writelane_b32 v40, s42, 8
+; GCN-NEXT:    v_writelane_b32 v40, s43, 9
+; GCN-NEXT:    v_writelane_b32 v40, s44, 10
 ; GCN-NEXT:    v_writelane_b32 v40, s46, 11
 ; GCN-NEXT:    v_writelane_b32 v40, s47, 12
 ; GCN-NEXT:    v_writelane_b32 v40, s48, 13
 ; GCN-NEXT:    v_writelane_b32 v40, s49, 14
 ; GCN-NEXT:    v_writelane_b32 v40, s50, 15
 ; GCN-NEXT:    v_writelane_b32 v40, s51, 16
-; GCN-NEXT:    s_mov_b32 s34, s14
-; GCN-NEXT:    s_mov_b32 s35, s13
-; GCN-NEXT:    s_mov_b32 s36, s12
-; GCN-NEXT:    s_mov_b64 s[38:39], s[10:11]
-; GCN-NEXT:    s_mov_b64 s[40:41], s[8:9]
-; GCN-NEXT:    s_mov_b64 s[42:43], s[6:7]
-; GCN-NEXT:    s_mov_b64 s[44:45], s[4:5]
+; GCN-NEXT:    s_mov_b32 s42, s14
+; GCN-NEXT:    s_mov_b32 s43, s13
+; GCN-NEXT:    s_mov_b32 s44, s12
+; GCN-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GCN-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GCN-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GCN-NEXT:    s_mov_b64 s[40:41], s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GCN-NEXT:    s_and_saveexec_b64 s[46:47], vcc
@@ -494,13 +494,13 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    v_readfirstlane_b32 s17, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[50:51], vcc
-; GCN-NEXT:    s_mov_b64 s[4:5], s[44:45]
-; GCN-NEXT:    s_mov_b64 s[6:7], s[42:43]
-; GCN-NEXT:    s_mov_b64 s[8:9], s[40:41]
-; GCN-NEXT:    s_mov_b64 s[10:11], s[38:39]
-; GCN-NEXT:    s_mov_b32 s12, s36
-; GCN-NEXT:    s_mov_b32 s13, s35
-; GCN-NEXT:    s_mov_b32 s14, s34
+; GCN-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GCN-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GCN-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GCN-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GCN-NEXT:    s_mov_b32 s12, s44
+; GCN-NEXT:    s_mov_b32 s13, s43
+; GCN-NEXT:    s_mov_b32 s14, s42
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    ; implicit-def: $vgpr31
@@ -518,14 +518,14 @@ define void @test_indirect_call_vgpr_ptr_in_branch(void()* %fptr, i1 %cond) {
 ; GCN-NEXT:    v_readlane_b32 s48, v40, 13
 ; GCN-NEXT:    v_readlane_b32 s47, v40, 12
 ; GCN-NEXT:    v_readlane_b32 s46, v40, 11
-; GCN-NEXT:    v_readlane_b32 s45, v40, 10
-; GCN-NEXT:    v_readlane_b32 s44, v40, 9
-; GCN-NEXT:    v_readlane_b32 s43, v40, 8
-; GCN-NEXT:    v_readlane_b32 s42, v40, 7
-; GCN-NEXT:    v_readlane_b32 s41, v40, 6
-; GCN-NEXT:    v_readlane_b32 s40, v40, 5
-; GCN-NEXT:    v_readlane_b32 s39, v40, 4
-; GCN-NEXT:    v_readlane_b32 s38, v40, 3
+; GCN-NEXT:    v_readlane_b32 s44, v40, 10
+; GCN-NEXT:    v_readlane_b32 s43, v40, 9
+; GCN-NEXT:    v_readlane_b32 s42, v40, 8
+; GCN-NEXT:    v_readlane_b32 s41, v40, 7
+; GCN-NEXT:    v_readlane_b32 s40, v40, 6
+; GCN-NEXT:    v_readlane_b32 s39, v40, 5
+; GCN-NEXT:    v_readlane_b32 s38, v40, 4
+; GCN-NEXT:    v_readlane_b32 s37, v40, 3
 ; GCN-NEXT:    v_readlane_b32 s36, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s35, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 0

diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index aaa014eb50c1d..45d1749c8d587 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1447,53 +1447,53 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %
 define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
 ; SI-LABEL: insert_split_bb:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s0, s[4:5], 0x4
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x4
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
+; SI-NEXT:    s_cmp_lg_u32 s6, 0
 ; SI-NEXT:    s_cbranch_scc0 BB30_2
 ; SI-NEXT:  ; %bb.1: ; %else
-; SI-NEXT:    s_load_dword s1, s[6:7], 0x1
-; SI-NEXT:    s_mov_b64 s[2:3], 0
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
+; SI-NEXT:    s_load_dword s7, s[2:3], 0x1
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
 ; SI-NEXT:    s_cbranch_vccz BB30_3
 ; SI-NEXT:    s_branch BB30_4
 ; SI-NEXT:  BB30_2:
 ; SI-NEXT:  BB30_3: ; %if
-; SI-NEXT:    s_load_dword s1, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s7, s[2:3], 0x0
 ; SI-NEXT:  BB30_4: ; %endif
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    s_mov_b32 s7, 0x100f000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: insert_split_bb:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dword s0, s[4:5], 0x10
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x10
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cmp_lg_u32 s6, 0
 ; VI-NEXT:    s_cbranch_scc0 BB30_2
 ; VI-NEXT:  ; %bb.1: ; %else
-; VI-NEXT:    s_load_dword s1, s[6:7], 0x4
+; VI-NEXT:    s_load_dword s7, s[2:3], 0x4
 ; VI-NEXT:    s_cbranch_execz BB30_3
 ; VI-NEXT:    s_branch BB30_4
 ; VI-NEXT:  BB30_2:
 ; VI-NEXT:  BB30_3: ; %if
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s1, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s7, s[2:3], 0x0
 ; VI-NEXT:  BB30_4: ; %endif
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 s7, 0x1100f000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 entry:
   %0 = insertelement <2 x i32> undef, i32 %a, i32 0

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index cff597bf77f38..0dfc0c7c7049e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -891,45 +891,45 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    s_mov_b64 s[0:1], exec
 ; SI-NEXT:    s_wqm_b64 exec, exec
 ; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz BB7_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB7_9
 ; SI-NEXT:  ; %bb.2: ; %.demote0
-; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
-; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
+; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; SI-NEXT:  BB7_3: ; %.continue0.preheader
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_mov_b64 s[2:3], 0
 ; SI-NEXT:    s_branch BB7_5
 ; SI-NEXT:  BB7_4: ; %.continue1
 ; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
-; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SI-NEXT:    s_add_i32 s2, s2, 1
-; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
-; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_add_i32 s6, s6, 1
+; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
+; SI-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz BB7_8
 ; SI-NEXT:  BB7_5: ; %.continue0
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_mov_b32_e32 v0, s2
-; SI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v2, v0
-; SI-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; SI-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
 ; SI-NEXT:    s_nop 0
 ; SI-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SI-NEXT:    s_nop 1
 ; SI-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 ; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
-; SI-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
-; SI-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execz BB7_4
 ; SI-NEXT:  ; %bb.6: ; %.demote1
 ; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
@@ -941,7 +941,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_branch BB7_4
 ; SI-NEXT:  BB7_8: ; %.return
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; SI-NEXT:    v_bfrev_b32_e32 v0, 60
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
@@ -957,45 +957,45 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s2, 0
+; GFX9-NEXT:    s_mov_b32 s6, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execz BB7_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
-; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:  BB7_3: ; %.continue0.preheader
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_branch BB7_5
 ; GFX9-NEXT:  BB7_4: ; %.continue1
 ; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT:    s_add_i32 s2, s2, 1
-; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_add_i32 s6, s6, 1
+; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_execz BB7_8
 ; GFX9-NEXT:  BB7_5: ; %.continue0
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9-NEXT:    s_nop 1
 ; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 ; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
-; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
 ; GFX9-NEXT:    s_cbranch_execz BB7_4
 ; GFX9-NEXT:  ; %bb.6: ; %.demote1
 ; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
@@ -1007,7 +1007,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[8:9]
 ; GFX9-NEXT:    s_branch BB7_4
 ; GFX9-NEXT:  BB7_8: ; %.return
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -1023,10 +1023,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-32-NEXT:    s_mov_b32 s1, 0
+; GFX10-32-NEXT:    s_mov_b32 s2, 0
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-32-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
@@ -1035,30 +1035,30 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    s_wqm_b32 s3, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
 ; GFX10-32-NEXT:  BB7_3: ; %.continue0.preheader
-; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-32-NEXT:    s_mov_b32 s2, 0
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_mov_b32 s3, 0
 ; GFX10-32-NEXT:    s_branch BB7_5
 ; GFX10-32-NEXT:  BB7_4: ; %.continue1
 ; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
-; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX10-32-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, s2, v1
-; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
-; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_add_i32 s3, s3, 1
+; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, s3, v1
+; GFX10-32-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:    s_cbranch_execz BB7_8
 ; GFX10-32-NEXT:  BB7_5: ; %.continue0
 ; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-32-NEXT:    s_mov_b32 s3, s0
-; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s3
-; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
+; GFX10-32-NEXT:    s_mov_b32 s1, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, s3, 0, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, s0, -1
 ; GFX10-32-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-32-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 ; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
-; GFX10-32-NEXT:    s_or_b32 s3, s3, vcc_lo
-; GFX10-32-NEXT:    s_and_saveexec_b32 s4, s3
-; GFX10-32-NEXT:    s_xor_b32 s3, exec_lo, s4
+; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
+; GFX10-32-NEXT:    s_and_saveexec_b32 s4, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s4
 ; GFX10-32-NEXT:    s_cbranch_execz BB7_4
 ; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
 ; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
@@ -1070,7 +1070,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s4
 ; GFX10-32-NEXT:    s_branch BB7_4
 ; GFX10-32-NEXT:  BB7_8: ; %.return
-; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -1086,42 +1086,42 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX10-64-NEXT:    s_wqm_b64 exec, exec
 ; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-64-NEXT:    s_mov_b32 s2, 0
+; GFX10-64-NEXT:    s_mov_b32 s6, 0
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX10-64-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
-; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
-; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT:  BB7_3: ; %.continue0.preheader
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX10-64-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX10-64-NEXT:    s_branch BB7_5
 ; GFX10-64-NEXT:  BB7_4: ; %.continue1
 ; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX10-64-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
-; GFX10-64-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_add_i32 s6, s6, 1
+; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
+; GFX10-64-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    s_cbranch_execz BB7_8
 ; GFX10-64-NEXT:  BB7_5: ; %.continue0
 ; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-64-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s[6:7]
-; GFX10-64-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, s6, 0, s[4:5]
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[0:1], -1
 ; GFX10-64-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-64-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
 ; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
-; GFX10-64-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; GFX10-64-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
-; GFX10-64-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; GFX10-64-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
 ; GFX10-64-NEXT:    s_cbranch_execz BB7_4
 ; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
 ; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
@@ -1133,7 +1133,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[8:9]
 ; GFX10-64-NEXT:    s_branch BB7_4
 ; GFX10-64-NEXT:  BB7_8: ; %.return
-; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 62c9ab28bf7da..948aa61360cbf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -237,111 +237,111 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
 define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
 ; SI-LABEL: round_v4f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0x11
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
+; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s14, -1
 ; SI-NEXT:    s_movk_i32 s18, 0xfc01
 ; SI-NEXT:    s_mov_b32 s3, 0xfffff
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
+; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
 ; SI-NEXT:    s_add_i32 s19, s0, s18
-; SI-NEXT:    s_mov_b32 s2, s6
+; SI-NEXT:    s_mov_b32 s2, s14
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s19
 ; SI-NEXT:    s_brev_b32 s20, 1
-; SI-NEXT:    s_andn2_b64 s[16:17], s[10:11], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s11, s20
+; SI-NEXT:    s_andn2_b64 s[16:17], s[6:7], s[0:1]
+; SI-NEXT:    s_and_b32 s0, s7, s20
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
 ; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s19, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s19, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
+; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
+; SI-NEXT:    s_bfe_u32 s0, s5, 0xb0014
 ; SI-NEXT:    s_add_i32 s17, s0, s18
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_brev_b32 s16, -2
 ; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
-; SI-NEXT:    v_mov_b32_e32 v4, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s7
 ; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_andn2_b64 s[10:11], s[8:9], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s9, s20
+; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[0:1]
+; SI-NEXT:    s_and_b32 s0, s5, s20
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
-; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s17, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s17, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s0, s15, 0xb0014
-; SI-NEXT:    s_add_i32 s10, s0, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s10
+; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
+; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
+; SI-NEXT:    s_add_i32 s6, s0, s18
+; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s6
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[0:1]
+; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[0:1]
 ; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
-; SI-NEXT:    s_and_b32 s0, s15, s20
+; SI-NEXT:    s_and_b32 s0, s11, s20
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s0
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s10, 51
+; SI-NEXT:    v_mov_b32_e32 v5, s11
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s6, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s10
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; SI-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s0, s13, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v10, s15
-; SI-NEXT:    s_add_i32 s8, s0, s18
+; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
+; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
+; SI-NEXT:    v_mov_b32_e32 v10, s11
+; SI-NEXT:    s_add_i32 s4, s0, s18
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s8
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s4
 ; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
-; SI-NEXT:    s_andn2_b64 s[2:3], s[12:13], s[0:1]
-; SI-NEXT:    s_and_b32 s0, s13, s20
+; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
+; SI-NEXT:    s_and_b32 s0, s9, s20
 ; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v5, s0
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s8, 0
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s8, 51
+; SI-NEXT:    v_mov_b32_e32 v5, s9
+; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s4, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v4, s2
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, s12
+; SI-NEXT:    v_mov_b32_e32 v10, s8
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; SI-NEXT:    v_add_f64 v[10:11], s[12:13], -v[4:5]
-; SI-NEXT:    v_mov_b32_e32 v13, s13
+; SI-NEXT:    v_add_f64 v[10:11], s[8:9], -v[4:5]
+; SI-NEXT:    v_mov_b32_e32 v13, s9
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
 ; SI-NEXT:    v_bfi_b32 v12, s16, v12, v13
 ; SI-NEXT:    v_cndmask_b32_e32 v11, 0, v12, vcc
 ; SI-NEXT:    v_mov_b32_e32 v10, 0
 ; SI-NEXT:    v_mov_b32_e32 v8, 0
 ; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[10:11]
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s15, 0xf000
 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[8:9]
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: round_v4f64:
@@ -396,166 +396,166 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
 define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
 ; SI-LABEL: round_v8f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0x19
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s7, 0xfc01
-; SI-NEXT:    s_mov_b32 s5, 0xfffff
-; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
+; SI-NEXT:    s_mov_b32 s22, -1
+; SI-NEXT:    s_movk_i32 s23, 0xfc01
+; SI-NEXT:    s_mov_b32 s21, 0xfffff
+; SI-NEXT:    s_mov_b32 s20, s22
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s26, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s26
+; SI-NEXT:    s_bfe_u32 s2, s7, 0xb0014
+; SI-NEXT:    s_add_i32 s26, s2, s23
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s26
 ; SI-NEXT:    s_brev_b32 s27, 1
-; SI-NEXT:    s_andn2_b64 s[24:25], s[10:11], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s11, s27
+; SI-NEXT:    s_andn2_b64 s[24:25], s[6:7], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s7, s27
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s25
 ; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s26, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s7
 ; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s26, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s24
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
-; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s25, s2, s7
+; SI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
+; SI-NEXT:    s_bfe_u32 s2, s5, 0xb0014
+; SI-NEXT:    s_add_i32 s25, s2, s23
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_brev_b32 s24, -2
 ; SI-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
-; SI-NEXT:    v_mov_b32_e32 v4, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s7
 ; SI-NEXT:    v_bfi_b32 v4, s24, v18, v4
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s25
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s25
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_andn2_b64 s[10:11], s[8:9], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s9, s27
+; SI-NEXT:    s_andn2_b64 s[6:7], s[4:5], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s5, s27
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    v_mov_b32_e32 v0, s11
+; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s25, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s25, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
-; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    s_add_i32 s10, s2, s7
+; SI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[0:1]
+; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
+; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    s_add_i32 s6, s2, s23
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; SI-NEXT:    v_bfi_b32 v6, s24, v18, v6
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s10
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
-; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s15, s27
+; SI-NEXT:    s_andn2_b64 s[4:5], s[10:11], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s11, s27
 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
+; SI-NEXT:    v_mov_b32_e32 v5, s11
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s6, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s10
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[2:3]
-; SI-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s2, s13, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v8, s15
-; SI-NEXT:    s_add_i32 s10, s2, s7
+; SI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
+; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
+; SI-NEXT:    v_mov_b32_e32 v8, s11
+; SI-NEXT:    s_add_i32 s6, s2, s23
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
 ; SI-NEXT:    v_bfi_b32 v8, s24, v18, v8
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s10
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
 ; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v8, vcc
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
-; SI-NEXT:    s_andn2_b64 s[8:9], s[12:13], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s13, s27
+; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s9, s27
 ; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
+; SI-NEXT:    v_mov_b32_e32 v5, s9
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s6, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v8, s12
+; SI-NEXT:    v_mov_b32_e32 v8, s8
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[2:3]
-; SI-NEXT:    v_add_f64 v[8:9], s[12:13], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s2, s19, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v10, s13
-; SI-NEXT:    s_add_i32 s10, s2, s7
+; SI-NEXT:    v_add_f64 v[8:9], s[8:9], -v[4:5]
+; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
+; SI-NEXT:    v_mov_b32_e32 v10, s9
+; SI-NEXT:    s_add_i32 s6, s2, s23
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
 ; SI-NEXT:    v_bfi_b32 v10, s24, v18, v10
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s10
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s6
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v10, vcc
 ; SI-NEXT:    v_mov_b32_e32 v8, 0
-; SI-NEXT:    s_andn2_b64 s[8:9], s[18:19], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s19, s27
+; SI-NEXT:    s_andn2_b64 s[4:5], s[14:15], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s15, s27
 ; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v9, s2
-; SI-NEXT:    v_mov_b32_e32 v8, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:    v_mov_b32_e32 v8, s5
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s19
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
+; SI-NEXT:    v_mov_b32_e32 v9, s15
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s6, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v13, v8, v9, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v8, s8
+; SI-NEXT:    v_mov_b32_e32 v8, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s18
+; SI-NEXT:    v_mov_b32_e32 v9, s14
 ; SI-NEXT:    v_cndmask_b32_e64 v12, v8, v9, s[2:3]
-; SI-NEXT:    s_bfe_u32 s2, s17, 0xb0014
-; SI-NEXT:    s_add_i32 s12, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s12
-; SI-NEXT:    s_andn2_b64 s[8:9], s[16:17], s[2:3]
-; SI-NEXT:    s_bfe_u32 s2, s23, 0xb0014
-; SI-NEXT:    s_add_i32 s14, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s14
-; SI-NEXT:    v_mov_b32_e32 v8, s19
-; SI-NEXT:    s_andn2_b64 s[10:11], s[22:23], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s23, s27
+; SI-NEXT:    s_bfe_u32 s2, s13, 0xb0014
+; SI-NEXT:    s_add_i32 s8, s2, s23
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s8
+; SI-NEXT:    s_andn2_b64 s[4:5], s[12:13], s[2:3]
+; SI-NEXT:    s_bfe_u32 s2, s19, 0xb0014
+; SI-NEXT:    s_add_i32 s10, s2, s23
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s10
+; SI-NEXT:    v_mov_b32_e32 v8, s15
+; SI-NEXT:    s_andn2_b64 s[6:7], s[18:19], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s19, s27
 ; SI-NEXT:    v_bfi_b32 v19, s24, v18, v8
 ; SI-NEXT:    v_mov_b32_e32 v9, s2
-; SI-NEXT:    v_mov_b32_e32 v8, s11
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s14, 0
+; SI-NEXT:    v_mov_b32_e32 v8, s7
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s23
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s14, 51
+; SI-NEXT:    v_mov_b32_e32 v9, s19
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v9, v8, v9, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v8, s10
+; SI-NEXT:    v_mov_b32_e32 v8, s6
 ; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, s22
+; SI-NEXT:    v_mov_b32_e32 v10, s18
 ; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[2:3]
-; SI-NEXT:    s_bfe_u32 s2, s21, 0xb0014
-; SI-NEXT:    s_add_i32 s7, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s7
-; SI-NEXT:    s_andn2_b64 s[4:5], s[20:21], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s21, s27
+; SI-NEXT:    s_bfe_u32 s2, s17, 0xb0014
+; SI-NEXT:    s_add_i32 s10, s2, s23
+; SI-NEXT:    s_lshr_b64 s[2:3], s[20:21], s10
+; SI-NEXT:    s_andn2_b64 s[6:7], s[16:17], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s17, s27
 ; SI-NEXT:    v_mov_b32_e32 v11, s2
-; SI-NEXT:    v_mov_b32_e32 v10, s5
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s7, 0
+; SI-NEXT:    v_mov_b32_e32 v10, s7
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s7, 51
-; SI-NEXT:    v_mov_b32_e32 v11, s21
+; SI-NEXT:    v_mov_b32_e32 v11, s17
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v15, v10, v11, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v10, s4
+; SI-NEXT:    v_mov_b32_e32 v10, s6
 ; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v11, s20
+; SI-NEXT:    v_mov_b32_e32 v11, s16
 ; SI-NEXT:    v_cndmask_b32_e64 v14, v10, v11, s[2:3]
-; SI-NEXT:    v_add_f64 v[10:11], s[20:21], -v[14:15]
-; SI-NEXT:    v_mov_b32_e32 v17, s23
+; SI-NEXT:    v_add_f64 v[10:11], s[16:17], -v[14:15]
+; SI-NEXT:    v_mov_b32_e32 v17, s19
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
-; SI-NEXT:    v_add_f64 v[10:11], s[22:23], -v[8:9]
-; SI-NEXT:    v_mov_b32_e32 v16, s21
+; SI-NEXT:    v_add_f64 v[10:11], s[18:19], -v[8:9]
+; SI-NEXT:    v_mov_b32_e32 v16, s17
 ; SI-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5
 ; SI-NEXT:    v_bfi_b32 v17, s24, v18, v17
 ; SI-NEXT:    v_cndmask_b32_e64 v11, 0, v17, s[2:3]
@@ -564,26 +564,26 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_add_f64 v[10:11], v[8:9], v[10:11]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v16, vcc
 ; SI-NEXT:    v_mov_b32_e32 v8, 0
-; SI-NEXT:    s_and_b32 s13, s17, s27
+; SI-NEXT:    s_and_b32 s9, s13, s27
 ; SI-NEXT:    v_add_f64 v[8:9], v[14:15], v[8:9]
-; SI-NEXT:    v_mov_b32_e32 v14, s9
-; SI-NEXT:    v_mov_b32_e32 v15, s13
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s12, 0
+; SI-NEXT:    v_mov_b32_e32 v14, s5
+; SI-NEXT:    v_mov_b32_e32 v15, s9
+; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s8, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
-; SI-NEXT:    v_mov_b32_e32 v15, s17
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s12, 51
+; SI-NEXT:    v_mov_b32_e32 v15, s13
+; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s8, 51
 ; SI-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v14, s8
+; SI-NEXT:    v_mov_b32_e32 v14, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v15, s16
+; SI-NEXT:    v_mov_b32_e32 v15, s12
 ; SI-NEXT:    v_cndmask_b32_e64 v16, v14, v15, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v14, s17
+; SI-NEXT:    v_mov_b32_e32 v14, s13
 ; SI-NEXT:    v_bfi_b32 v18, s24, v18, v14
-; SI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[16:17]
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:    v_add_f64 v[14:15], s[12:13], -v[16:17]
+; SI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; SI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[12:13]
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    v_add_f64 v[14:15], s[14:15], -v[12:13]
+; SI-NEXT:    s_mov_b32 s23, 0xf000
 ; SI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
 ; SI-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
@@ -592,78 +592,78 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; SI-NEXT:    v_mov_b32_e32 v12, 0
 ; SI-NEXT:    v_add_f64 v[12:13], v[16:17], v[12:13]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
-; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48
+; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: round_v8f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0x19
-; CI-NEXT:    s_brev_b32 s2, -2
+; CI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x19
+; CI-NEXT:    s_brev_b32 s18, -2
 ; CI-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s19, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
-; CI-NEXT:    v_mov_b32_e32 v4, s11
-; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; CI-NEXT:    v_bfi_b32 v4, s2, v16, v4
+; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[2:3]
+; CI-NEXT:    v_mov_b32_e32 v4, s3
+; CI-NEXT:    v_add_f64 v[2:3], s[2:3], -v[0:1]
+; CI-NEXT:    v_bfi_b32 v4, s18, v16, v4
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[8:9]
+; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[0:1]
 ; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; CI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[4:5]
-; CI-NEXT:    v_mov_b32_e32 v6, s9
+; CI-NEXT:    v_add_f64 v[0:1], s[0:1], -v[4:5]
+; CI-NEXT:    v_mov_b32_e32 v6, s1
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
-; CI-NEXT:    v_bfi_b32 v6, s2, v16, v6
+; CI-NEXT:    v_bfi_b32 v6, s18, v16, v6
 ; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[14:15]
+; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[6:7]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
-; CI-NEXT:    v_add_f64 v[4:5], s[14:15], -v[6:7]
-; CI-NEXT:    v_mov_b32_e32 v8, s15
+; CI-NEXT:    v_add_f64 v[4:5], s[6:7], -v[6:7]
+; CI-NEXT:    v_mov_b32_e32 v8, s7
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v8, s2, v16, v8
+; CI-NEXT:    v_bfi_b32 v8, s18, v16, v8
 ; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[12:13]
+; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[4:5]
 ; CI-NEXT:    v_mov_b32_e32 v4, 0
 ; CI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
-; CI-NEXT:    v_add_f64 v[4:5], s[12:13], -v[8:9]
-; CI-NEXT:    v_mov_b32_e32 v10, s13
+; CI-NEXT:    v_add_f64 v[4:5], s[4:5], -v[8:9]
+; CI-NEXT:    v_mov_b32_e32 v10, s5
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v10, s2, v16, v10
+; CI-NEXT:    v_bfi_b32 v10, s18, v16, v10
 ; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v10, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 0
 ; CI-NEXT:    v_add_f64 v[4:5], v[8:9], v[4:5]
-; CI-NEXT:    v_mov_b32_e32 v8, s19
-; CI-NEXT:    v_bfi_b32 v18, s2, v16, v8
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[20:21]
-; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[22:23]
-; CI-NEXT:    v_add_f64 v[14:15], s[20:21], -v[8:9]
-; CI-NEXT:    v_mov_b32_e32 v19, s23
+; CI-NEXT:    v_mov_b32_e32 v8, s11
+; CI-NEXT:    v_bfi_b32 v18, s18, v16, v8
+; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[12:13]
+; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[14:15]
+; CI-NEXT:    v_add_f64 v[14:15], s[12:13], -v[8:9]
+; CI-NEXT:    v_mov_b32_e32 v19, s15
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; CI-NEXT:    v_add_f64 v[14:15], s[22:23], -v[10:11]
-; CI-NEXT:    v_mov_b32_e32 v17, s21
+; CI-NEXT:    v_add_f64 v[14:15], s[14:15], -v[10:11]
+; CI-NEXT:    v_mov_b32_e32 v17, s13
 ; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
-; CI-NEXT:    v_bfi_b32 v19, s2, v16, v19
-; CI-NEXT:    v_trunc_f64_e32 v[12:13], s[16:17]
-; CI-NEXT:    v_bfi_b32 v17, s2, v16, v17
+; CI-NEXT:    v_bfi_b32 v19, s18, v16, v19
+; CI-NEXT:    v_trunc_f64_e32 v[12:13], s[8:9]
+; CI-NEXT:    v_bfi_b32 v17, s18, v16, v17
 ; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v14, 0
 ; CI-NEXT:    v_add_f64 v[10:11], v[10:11], v[14:15]
 ; CI-NEXT:    v_cndmask_b32_e32 v15, 0, v17, vcc
 ; CI-NEXT:    v_mov_b32_e32 v14, 0
-; CI-NEXT:    v_mov_b32_e32 v17, s17
+; CI-NEXT:    v_mov_b32_e32 v17, s9
+; CI-NEXT:    v_bfi_b32 v19, s18, v16, v17
 ; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[14:15]
-; CI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[12:13]
-; CI-NEXT:    v_bfi_b32 v19, s2, v16, v17
-; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[18:19]
+; CI-NEXT:    v_add_f64 v[14:15], s[8:9], -v[12:13]
+; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[10:11]
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; CI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[16:17]
-; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_add_f64 v[14:15], s[10:11], -v[16:17]
+; CI-NEXT:    s_mov_b32 s18, -1
 ; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
 ; CI-NEXT:    v_mov_b32_e32 v14, 0
 ; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v18, s[0:1]
@@ -671,10 +671,10 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
 ; CI-NEXT:    v_cndmask_b32_e32 v17, 0, v19, vcc
 ; CI-NEXT:    v_mov_b32_e32 v16, 0
 ; CI-NEXT:    v_add_f64 v[12:13], v[12:13], v[16:17]
-; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
-; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
-; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:48
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:32
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; CI-NEXT:    s_endpgm
   %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
   store <8 x double> %result, <8 x double> addrspace(1)* %out

diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index b0d686be188f5..ffe5784af1855 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -2893,28 +2893,32 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspa
 define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, 0xffff
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x10
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x10
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s5, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s4, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s7, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s6, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s9, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s11, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s13, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s15, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s17, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s16, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s19, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s53, s18, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s18, s1, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s19, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s21, s3, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s22, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s23, s5, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s24, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s25, s7, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s26, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s27, s9, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s28, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s29, s11, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s30, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s31, s13, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s33, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s34, s15, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s35, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s52, s1, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s53, s0, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s54, s3, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s55, s2, s20
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s5, s5, s20
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s4, s4, s20
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s7, s7, s20
@@ -2927,25 +2931,21 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s12, s12, s20
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s15, s15, s20
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s14, s14, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s17, s17, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s16, s16, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s19, s19, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s18, s18, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s54, s37, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s55, s36, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s56, s39, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s57, s38, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s58, s41, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s59, s40, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s60, s43, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s61, s42, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s62, s45, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s63, s44, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s64, s47, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s65, s46, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s66, s49, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s67, s48, s20
-; GCN-NOHSA-SI-NEXT:    s_and_b32 s68, s51, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s56, s37, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s57, s36, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s58, s39, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s59, s38, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s60, s41, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s61, s40, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s62, s43, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s63, s42, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s64, s45, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s65, s44, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s66, s47, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s67, s46, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s68, s49, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s69, s48, s20
+; GCN-NOHSA-SI-NEXT:    s_and_b32 s70, s51, s20
 ; GCN-NOHSA-SI-NEXT:    s_and_b32 s20, s50, s20
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s37, s37, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s36, 16
@@ -2963,31 +2963,33 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s51, s51, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s50, 16
 ; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s43, s43, 16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s50
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s68
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s51
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s67
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s69
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s66
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s68
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s49
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s65
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s67
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s46
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s64
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s66
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s47
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s63
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s65
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s64
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s45
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s61
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s63
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s42
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s60
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s59
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s62
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s61
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s43
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s40
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s58
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s60
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s41
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224
@@ -2996,220 +2998,220 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s57
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s59
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s56
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s58
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s39
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s55
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s57
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s54
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s56
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s37
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s53
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s52
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s35
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s34
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s33
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s31
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s30
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s29
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s27
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s26
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s55
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s54
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s53
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s52
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_mov_b32 s37, 0xffff
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_lshr_b32 s20, s5, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s21, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s22, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s23, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s24, s9, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s25, s8, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s26, s11, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s27, s10, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s28, s13, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s29, s12, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s30, s15, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s31, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s33, s17, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s34, s16, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s35, s19, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s36, s18, 16
-; GCN-HSA-NEXT:    s_and_b32 s38, s5, s37
-; GCN-HSA-NEXT:    s_and_b32 s39, s4, s37
-; GCN-HSA-NEXT:    s_and_b32 s40, s7, s37
-; GCN-HSA-NEXT:    s_and_b32 s41, s6, s37
-; GCN-HSA-NEXT:    s_and_b32 s42, s9, s37
-; GCN-HSA-NEXT:    s_and_b32 s43, s8, s37
-; GCN-HSA-NEXT:    s_and_b32 s44, s11, s37
-; GCN-HSA-NEXT:    s_and_b32 s45, s10, s37
-; GCN-HSA-NEXT:    s_and_b32 s46, s13, s37
-; GCN-HSA-NEXT:    s_and_b32 s47, s12, s37
-; GCN-HSA-NEXT:    s_and_b32 s48, s15, s37
-; GCN-HSA-NEXT:    s_and_b32 s49, s14, s37
-; GCN-HSA-NEXT:    s_and_b32 s50, s17, s37
-; GCN-HSA-NEXT:    s_and_b32 s51, s16, s37
-; GCN-HSA-NEXT:    s_and_b32 s52, s19, s37
-; GCN-HSA-NEXT:    s_and_b32 s53, s18, s37
-; GCN-HSA-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x10
+; GCN-HSA-NEXT:    s_lshr_b32 s20, s1, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s21, s0, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s22, s3, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s23, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s24, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s25, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s26, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s27, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s28, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s29, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s30, s11, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s31, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s33, s13, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s34, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s35, s15, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s36, s14, 16
+; GCN-HSA-NEXT:    s_and_b32 s38, s1, s37
+; GCN-HSA-NEXT:    s_and_b32 s39, s0, s37
+; GCN-HSA-NEXT:    s_and_b32 s40, s3, s37
+; GCN-HSA-NEXT:    s_and_b32 s41, s2, s37
+; GCN-HSA-NEXT:    s_and_b32 s42, s5, s37
+; GCN-HSA-NEXT:    s_and_b32 s43, s4, s37
+; GCN-HSA-NEXT:    s_and_b32 s44, s7, s37
+; GCN-HSA-NEXT:    s_and_b32 s45, s6, s37
+; GCN-HSA-NEXT:    s_and_b32 s46, s9, s37
+; GCN-HSA-NEXT:    s_and_b32 s47, s8, s37
+; GCN-HSA-NEXT:    s_and_b32 s48, s11, s37
+; GCN-HSA-NEXT:    s_and_b32 s49, s10, s37
+; GCN-HSA-NEXT:    s_and_b32 s50, s13, s37
+; GCN-HSA-NEXT:    s_and_b32 s51, s12, s37
+; GCN-HSA-NEXT:    s_and_b32 s52, s15, s37
+; GCN-HSA-NEXT:    s_and_b32 s53, s14, s37
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_and_b32 s59, s8, s37
-; GCN-HSA-NEXT:    s_and_b32 s60, s11, s37
-; GCN-HSA-NEXT:    s_and_b32 s61, s10, s37
-; GCN-HSA-NEXT:    s_and_b32 s62, s13, s37
-; GCN-HSA-NEXT:    s_and_b32 s63, s12, s37
-; GCN-HSA-NEXT:    s_and_b32 s64, s15, s37
-; GCN-HSA-NEXT:    s_and_b32 s65, s14, s37
-; GCN-HSA-NEXT:    s_and_b32 s66, s17, s37
-; GCN-HSA-NEXT:    s_and_b32 s67, s16, s37
-; GCN-HSA-NEXT:    s_and_b32 s68, s19, s37
-; GCN-HSA-NEXT:    s_and_b32 s54, s5, s37
-; GCN-HSA-NEXT:    s_and_b32 s55, s4, s37
-; GCN-HSA-NEXT:    s_and_b32 s56, s7, s37
-; GCN-HSA-NEXT:    s_and_b32 s57, s6, s37
-; GCN-HSA-NEXT:    s_and_b32 s58, s9, s37
-; GCN-HSA-NEXT:    s_and_b32 s37, s18, s37
+; GCN-HSA-NEXT:    s_and_b32 s57, s4, s37
+; GCN-HSA-NEXT:    s_and_b32 s58, s7, s37
+; GCN-HSA-NEXT:    s_and_b32 s59, s6, s37
+; GCN-HSA-NEXT:    s_and_b32 s60, s9, s37
+; GCN-HSA-NEXT:    s_and_b32 s61, s8, s37
+; GCN-HSA-NEXT:    s_and_b32 s62, s11, s37
+; GCN-HSA-NEXT:    s_and_b32 s63, s10, s37
+; GCN-HSA-NEXT:    s_and_b32 s64, s13, s37
+; GCN-HSA-NEXT:    s_and_b32 s65, s12, s37
+; GCN-HSA-NEXT:    s_and_b32 s66, s15, s37
+; GCN-HSA-NEXT:    s_and_b32 s54, s3, s37
+; GCN-HSA-NEXT:    s_and_b32 s55, s2, s37
+; GCN-HSA-NEXT:    s_and_b32 s56, s5, s37
+; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s6, s6, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s9, s9, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s8, s8, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s11, s11, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s10, s10, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s13, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s12, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s15, s15, 16
+; GCN-HSA-NEXT:    s_and_b32 s18, s1, s37
+; GCN-HSA-NEXT:    s_and_b32 s19, s0, s37
+; GCN-HSA-NEXT:    s_and_b32 s37, s14, s37
 ; GCN-HSA-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s17, s17, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s16, s16, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s19, s19, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s18, s18, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s5, s5, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s67, s1, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s68, s0, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s3, s3, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s2, s2, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s4, s4, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s7, s7, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s6, s6, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s8, s8, 16
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xd0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s67
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s66
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s17
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xe0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xd0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0xa0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x90
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s65
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s13
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x70
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s61
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x80
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s59
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s58
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s7
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s68
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x60
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s66
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s15
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s59
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s55
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s63
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s10
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s62
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s61
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s9
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s57
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s55
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s56
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s68
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s18
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s53
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s67
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s51
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s36
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s52
@@ -3222,224 +3224,226 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 64
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s49
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s31
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s30
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 48
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s47
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s46
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s28
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 32
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s45
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s44
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s26
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_add_u32 s0, s16, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s43
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s42
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s24
-; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT:    s_addc_u32 s1, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s41
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s40
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s39
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s38
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s20, 0xffff
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x40
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x40
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s68, s19, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s51, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s53, s50, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s55, s4, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s57, s6, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s59, s8, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s10, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s62, s13, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s63, s12, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s64, s15, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s65, s14, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s66, s17, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s67, s16, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s19, s19, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s21, s37, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s37, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s23, s36, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s36, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s25, s39, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s39, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s27, s38, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s38, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s29, s41, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s41, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s31, s40, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s33, s40, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s43, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s35, s43, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s42, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s37, s42, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s45, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s39, s45, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s40, s44, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s41, s44, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s47, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s43, s47, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s44, s46, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s45, s46, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s46, s49, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s47, s49, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s49, s48, 16
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s48, s48, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s51, s51, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s70, s15, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s61, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s63, s6, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s64, s9, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s65, s8, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s66, s11, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s67, s10, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s68, s13, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s69, s12, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s37, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s19, s37, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s21, s36, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s22, s36, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s23, s39, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s24, s39, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s25, s38, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s26, s38, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s27, s41, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s28, s41, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s29, s40, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s30, s40, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s31, s43, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s33, s43, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s42, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s35, s42, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s45, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s37, s45, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s44, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s39, s44, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s40, s47, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s41, s47, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s46, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s43, s46, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s44, s49, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s45, s49, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s46, s48, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s47, s48, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s51, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s49, s51, s20
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s51, s50, 16
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s50, s50, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s53, s1, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s55, s0, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s57, s3, s20
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s59, s2, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s5, s5, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s4, s4, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s7, s7, s20
 ; GCN-NOHSA-VI-NEXT:    s_and_b32 s6, s6, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s9, s9, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s8, s8, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s11, s11, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s10, s10, s20
-; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s18, s20
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s18, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s17, s17, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s16, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s18
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s68
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s15, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_and_b32 s20, s14, s20
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s67
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s66
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s52, s1, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s3, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s58, s2, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s13, s13, 16
 ; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s65
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s70
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s63
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s11, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s69
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s62
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s68
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s9, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s67
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s10
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s66
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s65
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s61
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s63
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s62
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s59
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s61
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s60
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s57
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s59
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s57
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s56
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s55
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s55
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s53
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s52
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s53
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s51
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s52
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s51
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s49
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s48
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s48
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s49
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s47
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s46
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s47
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s46
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s45
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s44
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s45
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s43
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s43
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s40
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s41
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s39
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s38
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s39
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s38
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s36
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s37
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s36
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s35
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s34
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s35
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s31
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s33
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s30
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s23
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -4015,163 +4019,166 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspa
 ;
 ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x40
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x40
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[36:51], s[18:19], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s67, s19, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s68, s18, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s19, s19
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s18, s18
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s65, s17, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s66, s16, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s17, s17
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s16, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s19
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s63, s15, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s64, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s69, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s70, s14, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s15, s15
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s14, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s17
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s61, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s62, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s67, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s68, s12, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s18, s37, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s20, s37
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s22, s39, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s24, s39
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s26, s41, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s28, s41
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s30, s43, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s33, s43
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s35, s45, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s37, s45
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s39, s47, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s41, s47
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s43, s49, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s45, s49
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s47, s51, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s49, s51
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s51, s1, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s53, s1
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s52, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s54, s0
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s55, s3, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s57, s3
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s56, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s58, s2
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s13, s13
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s12, s12
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s70
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s59, s11, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s60, s10, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s69
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s65, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s66, s10, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s11, s11
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s10, s10
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s68
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s57, s9, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s58, s8, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s63, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s64, s8, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s9, s9
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s8, s8
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s66
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s55, s7, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s56, s6, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s65
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s61, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s62, s6, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s7, s7
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s6, s6
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s64
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s57
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s53, s5, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s54, s4, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s59, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s60, s4, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s4, s4
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s62
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s19, s36, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s59
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s21, s36
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s57
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s52, s50, 16
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s20, s37, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s22, s37
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s24, s39, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s26, s39
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s28, s41, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s30, s41
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s33, s43, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s35, s43
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s37, s45, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s39, s45
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s41, s47, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s43, s47
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s45, s49, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s47, s49
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s49, s51, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s51, s51
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s23, s38, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s25, s38
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s27, s40, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s29, s40
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s31, s42, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s34, s42
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s36, s44, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s38, s44
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s40, s46, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s42, s46
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s44, s48, 16
+; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s46, s48
+; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s48, s50, 16
 ; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s50, s50
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s54
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s53
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s53
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s21, s36, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s23, s36
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s25, s38, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s27, s38
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s29, s40, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s31, s40
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s34, s42, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s36, s42
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s38, s44, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s40, s44
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s42, s46, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s44, s46
-; GCN-NOHSA-VI-NEXT:    s_ashr_i32 s46, s48, 16
-; GCN-NOHSA-VI-NEXT:    s_sext_i32_i16 s48, s48
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s52
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s51
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s49
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s47
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s48
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s46
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s47
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s45
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s46
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s44
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s45
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s44
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s42
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s43
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s41
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s40
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s41
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s39
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s40
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s38
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s39
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s38
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s36
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s37
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s34
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s35
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s34
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s30
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s30
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s28
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s28
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s26
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s27
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s25
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s25
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s22
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s20
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s21
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s18
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
@@ -6644,101 +6651,103 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspa
 define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
 ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-NOHSA-SI:       ; %bb.0:
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x0
+; GCN-NOHSA-SI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, s23
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s4, s21
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s6, s19
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s24, s17
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s15
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s28, s13
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s30, s11
-; GCN-NOHSA-SI-NEXT:    s_mov_b32 s34, s9
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s22, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s20, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s18, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s16, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[44:45], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s14, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s12, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s10, 16
-; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s8, 16
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[54:55], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[56:57], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[58:59], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[60:61], s[16:17], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[62:63], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[64:65], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[66:67], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s18, s15
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s20, s13
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s22, s11
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s24, s9
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s26, s7
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s28, s5
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s30, s3
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s34, s1
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s36, s14, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s38, s12, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s40, s10, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s42, s8, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[44:45], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s46, s6, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s48, s4, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s50, s2, 16
+; GCN-NOHSA-SI-NEXT:    s_lshr_b32 s52, s0, 16
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[54:55], s[2:3], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[56:57], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[58:59], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[60:61], s[8:9], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[62:63], s[10:11], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[64:65], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[66:67], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[68:69], s[0:1], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[70:71], s[2:3], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[6:7], s[6:7], 48
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[8:9], s[8:9], 48
 ; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[10:11], s[10:11], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[14:15], s[14:15], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[16:17], s[16:17], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[18:19], s[18:19], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[20:21], s[20:21], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[22:23], s[22:23], 48
-; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[12:13], s[12:13], 48
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s23
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s44
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s45
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s21
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[2:3], s[12:13], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[12:13], s[14:15], 48
+; GCN-NOHSA-SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 48
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s44
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s45
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s2
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s3
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[28:29], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[26:27], s[34:35], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[28:29], s[30:31], 0x100000
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s7
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s25
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s17
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s23
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s21
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s13
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[28:29], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[34:35], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[30:31], 0x100000
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[52:53], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[50:51], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[48:49], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[46:47], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[18:19], s[42:43], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[40:41], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[22:23], s[38:39], 0x100000
-; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[24:25], s[36:37], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[4:5], s[52:53], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[6:7], s[50:51], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[8:9], s[48:49], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[10:11], s[46:47], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[12:13], s[42:43], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[14:15], s[40:41], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[16:17], s[38:39], 0x100000
+; GCN-NOHSA-SI-NEXT:    s_bfe_i64 s[20:21], s[36:37], 0x100000
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s28
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s29
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s70
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s71
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s26
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s27
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s68
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s69
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s66
@@ -6755,382 +6764,390 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspa
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, s57
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, s54
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, s55
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s21
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s22
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s23
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, s16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, s17
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s20
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, s14
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, s15
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s18
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s19
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, s12
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, s13
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s17
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, s10
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, s11
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s14
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s15
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, s8
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, s9
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, s12
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, s13
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, s6
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
-; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x0
+; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT:    s_mov_b32 s8, s51
-; GCN-HSA-NEXT:    s_mov_b32 s34, s49
-; GCN-HSA-NEXT:    s_mov_b32 s52, s47
-; GCN-HSA-NEXT:    s_mov_b32 s54, s45
-; GCN-HSA-NEXT:    s_mov_b32 s56, s43
-; GCN-HSA-NEXT:    s_mov_b32 s58, s41
-; GCN-HSA-NEXT:    s_mov_b32 s60, s39
-; GCN-HSA-NEXT:    s_mov_b32 s62, s37
-; GCN-HSA-NEXT:    s_lshr_b32 s30, s46, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s24, s44, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s20, s42, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s16, s40, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s12, s38, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-HSA-NEXT:    s_lshr_b32 s64, s50, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s66, s48, 16
-; GCN-HSA-NEXT:    s_lshr_b32 s68, s36, 16
-; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[36:37], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[28:29], s[36:37], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[38:39], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[36:37], s[38:39], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[40:41], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[38:39], s[40:41], 48
-; GCN-HSA-NEXT:    s_ashr_i64 s[40:41], s[42:43], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[42:43], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[42:43], s[44:45], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[44:45], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[44:45], s[46:47], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[46:47], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[46:47], s[48:49], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[22:23], s[48:49], 0x100000
-; GCN-HSA-NEXT:    s_ashr_i64 s[48:49], s[50:51], 48
-; GCN-HSA-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x100000
+; GCN-HSA-NEXT:    s_mov_b32 s38, s15
+; GCN-HSA-NEXT:    s_mov_b32 s40, s13
+; GCN-HSA-NEXT:    s_mov_b32 s42, s11
+; GCN-HSA-NEXT:    s_mov_b32 s44, s9
+; GCN-HSA-NEXT:    s_mov_b32 s46, s7
+; GCN-HSA-NEXT:    s_mov_b32 s48, s5
+; GCN-HSA-NEXT:    s_mov_b32 s50, s3
+; GCN-HSA-NEXT:    s_mov_b32 s52, s1
+; GCN-HSA-NEXT:    s_lshr_b32 s54, s14, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s56, s12, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s58, s10, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s60, s8, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s62, s6, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s64, s4, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s66, s2, 16
+; GCN-HSA-NEXT:    s_lshr_b32 s68, s0, 16
+; GCN-HSA-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[36:37], s[0:1], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[0:1], s[14:15], 48
+; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[70:71], s[2:3], 48
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[38:39], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[74:75], s[6:7], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[76:77], s[8:9], 48
+; GCN-HSA-NEXT:    s_ashr_i64 s[78:79], s[10:11], 48
+; GCN-HSA-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[24:25], s[6:7], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[26:27], s[8:9], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[28:29], s[10:11], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[12:13], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[12:13], s[12:13], 48
+; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[14:15], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
+; GCN-HSA-NEXT:    s_ashr_i64 s[72:73], s[4:5], 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT:    s_bfe_i64 s[0:1], s[68:69], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[2:3], s[66:67], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[4:5], s[64:65], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[6:7], s[62:63], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[60:61], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[10:11], s[58:59], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[14:15], s[56:57], 0x100000
+; GCN-HSA-NEXT:    s_bfe_i64 s[38:39], s[54:55], 0x100000
 ; GCN-HSA-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[26:27], s[50:51], 0x100000
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s49
-; GCN-HSA-NEXT:    s_bfe_i64 s[8:9], s[68:69], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[48:49], s[66:67], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[50:51], s[64:65], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x100000
-; GCN-HSA-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x100000
-; GCN-HSA-NEXT:    s_add_u32 s64, s0, 0xf0
-; GCN-HSA-NEXT:    s_addc_u32 s65, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s34
-; GCN-HSA-NEXT:    s_add_u32 s34, s0, 0xd0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s35
-; GCN-HSA-NEXT:    s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s35
-; GCN-HSA-NEXT:    s_add_u32 s34, s0, 0xb0
-; GCN-HSA-NEXT:    s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s35
-; GCN-HSA-NEXT:    s_add_u32 s34, s0, 0x90
-; GCN-HSA-NEXT:    s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s35
-; GCN-HSA-NEXT:    s_add_u32 s34, s0, 0x70
-; GCN-HSA-NEXT:    s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s35
-; GCN-HSA-NEXT:    s_add_u32 s34, s0, 0x50
-; GCN-HSA-NEXT:    s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s35
-; GCN-HSA-NEXT:    s_add_u32 s34, s0, 48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s46
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s47
+; GCN-HSA-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x100000
+; GCN-HSA-NEXT:    s_add_u32 s54, s16, 0xf0
+; GCN-HSA-NEXT:    s_addc_u32 s55, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xd0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s13
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xb0
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0x90
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v29, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v28, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0x70
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v31, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v30, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0x50
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s40
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s41
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
-; GCN-HSA-NEXT:    s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s34
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s35
-; GCN-HSA-NEXT:    s_add_u32 s34, s0, 16
-; GCN-HSA-NEXT:    s_addc_u32 s35, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s52
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s53
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s44
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 16
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xe0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s46
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s47
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s74
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s75
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s54
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s12
+; GCN-HSA-NEXT:    s_add_u32 s12, s16, 0xc0
+; GCN-HSA-NEXT:    s_addc_u32 s13, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s55
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s42
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s43
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s78
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s79
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s44
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s45
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s76
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s77
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s48
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s49
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s72
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s73
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s51
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s70
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s71
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s26
-; GCN-HSA-NEXT:    s_add_u32 s26, s0, 0xe0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s27
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s54
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s55
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s42
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s43
-; GCN-HSA-NEXT:    s_addc_u32 s27, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[28:29], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s65
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s56
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s57
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s40
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s41
-; GCN-HSA-NEXT:    s_add_u32 s22, s0, 0xc0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v34, s34
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[30:31], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s23
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s26
-; GCN-HSA-NEXT:    s_addc_u32 s23, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s22
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s58
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s59
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s38
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s39
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s61
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s36
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s37
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s62
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s63
-; GCN-HSA-NEXT:    v_mov_b32_e32 v35, s35
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s28
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s29
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s51
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s52
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s53
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, s36
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s37
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s34
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s35
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s30
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s38
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s39
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s31
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[32:33], v[20:23]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s48
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s49
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s23
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s12
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[34:35], v[4:7]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
-; GCN-HSA-NEXT:    s_add_u32 s18, s0, 0xa0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s19
-; GCN-HSA-NEXT:    s_addc_u32 s19, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s18
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s30
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s31
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s19
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x80
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s14
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s15
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-HSA-NEXT:    s_add_u32 s10, s0, 0x60
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-HSA-NEXT:    s_addc_u32 s11, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT:    s_add_u32 s10, s16, 0xa0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-HSA-NEXT:    s_addc_u32 s11, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s10
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s29
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-HSA-NEXT:    s_add_u32 s6, s0, 64
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-HSA-NEXT:    s_addc_u32 s7, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-HSA-NEXT:    s_add_u32 s8, s16, 0x80
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-HSA-NEXT:    s_addc_u32 s9, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s26
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s27
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-HSA-NEXT:    s_add_u32 s6, s16, 0x60
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT:    s_addc_u32 s7, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s17
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s25
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s7
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_nop 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT:    s_add_u32 s4, s16, 64
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT:    s_addc_u32 s5, s17, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s9
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT:    s_nop 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    s_add_u32 s2, s16, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-HSA-NEXT:    s_addc_u32 s3, s17, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s17
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-NOHSA-VI:       ; %bb.0:
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[12:27], s[10:11], 0x0
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s10, -1
+; GCN-NOHSA-VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s66, s27
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s68, s27, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[68:69], s[68:69], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[64:65], s[26:27], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s26, s26, 16
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s60, s25
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s62, s25, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s66
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s67
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s68
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s69
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[58:59], s[24:25], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s24, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s24, s0, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s26, s1
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s1, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s30, s2, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s34, s3
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s36, s3, 16
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s4, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s40, s5
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s42, s5, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s46, s7
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s48, s7, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s52, s9
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s54, s9, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s58, s11
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s60, s11, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s64, s13
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s66, s13, 16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s70, s15
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s72, s15, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[18:19], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[68:69], s[14:15], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[24:25], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[28:29], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[34:35], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[38:39], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[38:39], s[42:43], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[42:43], s[48:49], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[54:55], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[54:55], s[60:61], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[60:61], s[66:67], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[66:67], s[72:73], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s16
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s17
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[26:27], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[26:27], s[30:31], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[36:37], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[40:41], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[40:41], s[46:47], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[46:47], s[52:53], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[52:53], s[58:59], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[58:59], s[64:65], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[64:65], s[70:71], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT:    s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s64
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s65
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s26
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s54, s23
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s56, s23, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s60
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s61
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s62
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s63
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[52:53], s[22:23], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s22, s22, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s66
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s67
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[62:63], s[12:13], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s12, s12, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s68
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s69
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s58
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s59
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s24
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s48, s21
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s50, s21, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s54
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s55
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s56
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s57
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[46:47], s[20:21], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s20, s20, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s60
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s61
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[56:57], s[10:11], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s10, s10, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s62
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s63
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s13
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s52
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s53
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s23
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s42, s19
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s44, s19, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s48
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s49
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s50
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s51
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[40:41], s[18:19], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s18, s18, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s54
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s55
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[50:51], s[8:9], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s8, s8, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s56
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s57
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s11
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s46
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s47
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s20
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s21
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s36, s17
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s38, s17, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s42
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s43
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s44
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s45
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[4:5], s[16:17], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s16, s16, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s49
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[44:45], s[6:7], 0x100000
+; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s6, 16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s50
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s51
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s9
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s40
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s41
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s30, s15
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s34, s15, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x100000
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s42
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s43
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s44
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s45
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT:    s_nop 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s37
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s38
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s39
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[2:3], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s14, s14, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s6, s12, 16
-; GCN-NOHSA-VI-NEXT:    s_lshr_b32 s28, s13, 16
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[0:1], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_mov_b32 s12, s13
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s30
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s31
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s22
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s23
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s35
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s28
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s29
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    s_nop 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s30
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s31
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s20
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s21
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s24
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s25
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v0, s18
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v32i16_to_v32i64:

diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index eff979fad141f..104f0335e34a8 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -161,68 +161,68 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[12:13], s[14:15]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s10
 ; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s7
-; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s11
-; GCN-IR-NEXT:    s_min_u32 s12, s12, s13
-; GCN-IR-NEXT:    s_min_u32 s16, s14, s15
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[14:15], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR-NEXT:    s_min_u32 s14, s12, s13
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s10
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s11
+; GCN-IR-NEXT:    s_min_u32 s16, s12, s13
+; GCN-IR-NEXT:    s_sub_u32 s12, s14, s16
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[12:13], 63
+; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[20:21], s[12:13], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[22:23], s[18:19], -1
 ; GCN-IR-NEXT:    s_and_b64 s[20:21], s[22:23], s[20:21]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-IR-NEXT:    s_add_u32 s18, s12, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-IR-NEXT:    s_addc_u32 s19, s13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
+; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[10:11], s14
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[10:11], s12
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
-; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
+; GCN-IR-NEXT:    s_add_u32 s20, s6, -1
+; GCN-IR-NEXT:    s_addc_u32 s21, s7, -1
+; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
+; GCN-IR-NEXT:    s_add_u32 s10, s8, s16
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, s15
+; GCN-IR-NEXT:    s_mov_b32 s17, s15
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s8, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[18:19], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
+; GCN-IR-NEXT:    s_sub_u32 s8, s20, s16
+; GCN-IR-NEXT:    s_subb_u32 s8, s21, s17
+; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
+; GCN-IR-NEXT:    s_mov_b32 s15, s14
+; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
+; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s18, s16, s18
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_subb_u32 s19, s17, s19
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
 ; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
@@ -396,73 +396,73 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v3
-; GCN-IR-NEXT:    v_min_u32_e32 v13, v0, v7
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v0, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v10
-; GCN-IR-NEXT:    v_min_u32_e32 v14, v0, v7
-; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v13, v14
+; GCN-IR-NEXT:    v_min_u32_e32 v0, v0, v7
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v9
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v8, v10
+; GCN-IR-NEXT:    v_min_u32_e32 v13, v7, v8
+; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v0, v13
 ; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[7:8]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[6:7], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v5
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v12, v10, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v18
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v9, 0, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, v17
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v11, v9, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v7
-; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v0, s[4:5], 63, v7
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8]
+; GCN-IR-NEXT:    v_add_i32_e32 v15, vcc, 1, v7
+; GCN-IR-NEXT:    v_addc_u32_e32 v16, vcc, 0, v8, vcc
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[15:16], v[7:8]
+; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], 63, v7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[9:10], v0
+; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[9:10], v7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, -1, v2
-; GCN-IR-NEXT:    v_lshr_b64 v[16:17], v[9:10], v16
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v10, v13
-; GCN-IR-NEXT:    v_not_b32_e32 v11, v18
-; GCN-IR-NEXT:    v_add_i32_e32 v13, vcc, v10, v14
+; GCN-IR-NEXT:    v_add_i32_e32 v19, vcc, -1, v2
+; GCN-IR-NEXT:    v_addc_u32_e32 v20, vcc, -1, v3, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v0, v0
+; GCN-IR-NEXT:    v_lshr_b64 v[15:16], v[9:10], v15
+; GCN-IR-NEXT:    v_not_b32_e32 v10, v17
+; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, v0, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v14, vcc, v11, v15, vcc
+; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, v10, v14, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v10, 31, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v16, v10
+; GCN-IR-NEXT:    v_lshl_b64 v[13:14], v[15:16], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 31, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v0, v13, v0
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v11, vcc, v0, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v9, v17, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v18, v7
-; GCN-IR-NEXT:    v_add_i32_e32 v18, vcc, 1, v13
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v15, 31, v11
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v19, v8
-; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, 0, v14, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[18:19], v[13:14]
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v18
+; GCN-IR-NEXT:    v_sub_i32_e32 v11, vcc, v19, v0
+; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, v20, v14, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v17, v7
+; GCN-IR-NEXT:    v_add_i32_e32 v17, vcc, 1, v9
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v11
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v18, v8
+; GCN-IR-NEXT:    v_addc_u32_e32 v18, vcc, 0, v10, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10]
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v17
 ; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_and_b32_e32 v11, 1, v15
-; GCN-IR-NEXT:    v_and_b32_e32 v20, v15, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v15, v2
-; GCN-IR-NEXT:    v_sub_i32_e64 v16, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, v19
-; GCN-IR-NEXT:    v_mov_b32_e32 v19, v12
-; GCN-IR-NEXT:    v_subb_u32_e64 v17, s[4:5], v17, v20, s[4:5]
+; GCN-IR-NEXT:    v_and_b32_e32 v11, 1, v13
+; GCN-IR-NEXT:    v_and_b32_e32 v16, v13, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v13, v2
+; GCN-IR-NEXT:    v_sub_i32_e64 v15, s[4:5], v0, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v18
+; GCN-IR-NEXT:    v_mov_b32_e32 v18, v12
+; GCN-IR-NEXT:    v_subb_u32_e64 v16, s[4:5], v14, v16, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v18, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v17, v11
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
@@ -471,15 +471,15 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[7:8], 1
 ; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v0, v11, v2
+; GCN-IR-NEXT:    v_or_b32_e32 v11, v11, v2
 ; GCN-IR-NEXT:  BB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_xor_b32_e32 v2, v5, v4
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GCN-IR-NEXT:    v_xor_b32_e32 v0, v5, v4
+; GCN-IR-NEXT:    v_xor_b32_e32 v3, v11, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GCN-IR-NEXT:    v_xor_b32_e32 v3, v12, v1
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-IR-NEXT:    v_xor_b32_e32 v2, v12, v1
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
+; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %x, %y
   ret i64 %result
@@ -1022,68 +1022,68 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[12:13], s[14:15]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s10
 ; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s7
-; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s11
-; GCN-IR-NEXT:    s_min_u32 s12, s12, s13
-; GCN-IR-NEXT:    s_min_u32 s16, s14, s15
-; GCN-IR-NEXT:    s_sub_u32 s14, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[14:15], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
+; GCN-IR-NEXT:    s_min_u32 s14, s12, s13
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s10
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s11
+; GCN-IR-NEXT:    s_min_u32 s16, s12, s13
+; GCN-IR-NEXT:    s_sub_u32 s12, s14, s16
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[20:21], s[12:13], 63
+; GCN-IR-NEXT:    s_mov_b32 s15, 0
 ; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[20:21]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[20:21], s[14:15], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[20:21], s[12:13], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[22:23], s[18:19], -1
 ; GCN-IR-NEXT:    s_and_b64 s[20:21], s[22:23], s[20:21]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[20:21]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s14, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s14
-; GCN-IR-NEXT:    s_addc_u32 s19, s15, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-IR-NEXT:    s_add_u32 s18, s12, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-IR-NEXT:    s_addc_u32 s19, s13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s14, 63, s14
+; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[10:11], s14
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[10:11], s12
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[10:11], s18
-; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
+; GCN-IR-NEXT:    s_add_u32 s20, s6, -1
+; GCN-IR-NEXT:    s_addc_u32 s21, s7, -1
+; GCN-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
+; GCN-IR-NEXT:    s_add_u32 s10, s8, s16
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, s15
+; GCN-IR-NEXT:    s_mov_b32 s17, s15
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s8, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[18:19], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
+; GCN-IR-NEXT:    s_sub_u32 s8, s20, s16
+; GCN-IR-NEXT:    s_subb_u32 s8, s21, s17
+; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
+; GCN-IR-NEXT:    s_mov_b32 s15, s14
+; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
+; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s18, s16, s18
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_subb_u32 s19, s17, s19
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_3
 ; GCN-IR-NEXT:  BB9_4: ; %Flow3
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
@@ -1242,61 +1242,61 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT:    s_min_u32 s10, s6, s7
-; GCN-IR-NEXT:    s_add_u32 s8, s10, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
+; GCN-IR-NEXT:    s_min_u32 s8, s6, s7
+; GCN-IR-NEXT:    s_add_u32 s10, s8, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s11, 0, -1
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[10:11], 63
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[14:15], s[8:9], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[14:15], s[10:11], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[16:17], s[12:13], -1
 ; GCN-IR-NEXT:    s_and_b64 s[14:15], s[16:17], s[14:15]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[14:15]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s8, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_addc_u32 s15, s9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT:    s_add_u32 s12, s10, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_addc_u32 s13, s11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s9, 63, s10
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], 24, s8
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s9
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], 24, s14
-; GCN-IR-NEXT:    s_add_u32 s8, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s9, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s10, 58, s10
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s12
+; GCN-IR-NEXT:    s_add_u32 s16, s2, -1
+; GCN-IR-NEXT:    s_addc_u32 s17, s3, -1
+; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
+; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  BB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s6, s13, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s6, s8, s16
-; GCN-IR-NEXT:    s_subb_u32 s6, s9, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s18
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s19
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_lshr_b32 s6, s11, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s6, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s6, s17, s15
+; GCN-IR-NEXT:    s_ashr_i32 s12, s6, 31
+; GCN-IR-NEXT:    s_mov_b32 s13, s12
+; GCN-IR-NEXT:    s_and_b32 s6, s12, 1
+; GCN-IR-NEXT:    s_and_b64 s[18:19], s[12:13], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s14, s14, s18
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_subb_u32 s15, s15, s19
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
+; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_3
 ; GCN-IR-NEXT:  BB10_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
@@ -1442,26 +1442,26 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v10
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v8
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, 24, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v9
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB11_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5]
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5]
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], 24, v4
@@ -1471,38 +1471,38 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB11_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[12:13], 24, v8
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, 58, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v9, v13, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v14, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v14, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v14, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v10
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v15, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v11, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v14
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT:    v_and_b32_e32 v17, v12, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v16, v12, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v17
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
-; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v17
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
+; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB11_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
@@ -1678,39 +1678,39 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[12:13], s[4:5], v10
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 47, v6
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:  BB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[12:13], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v8
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v10, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v11, v13, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v14, v4
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v14, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v14, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v15, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v7, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[6:7]
-; GCN-IR-NEXT:    v_mov_b32_e32 v6, v14
+; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v17, v12, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v16, v12, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, v12
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v12, v17
-; GCN-IR-NEXT:    v_mov_b32_e32 v7, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v9
-; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v17
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v9
+; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v8
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index fd75313c6ec77..2fb3b77d0e414 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -18,15 +18,15 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a,
 ; SI-NEXT:    s_cmp_lg_u32 s8, 0
 ; SI-NEXT:    s_cbranch_scc0 BB0_2
 ; SI-NEXT:  ; %bb.1: ; %else
-; SI-NEXT:    s_add_i32 s0, s11, s0
+; SI-NEXT:    s_add_i32 s2, s11, s0
 ; SI-NEXT:    s_cbranch_execz BB0_3
 ; SI-NEXT:    s_branch BB0_4
 ; SI-NEXT:  BB0_2:
-; SI-NEXT:    ; implicit-def: $sgpr0
+; SI-NEXT:    ; implicit-def: $sgpr2
 ; SI-NEXT:  BB0_3: ; %if
-; SI-NEXT:    s_sub_i32 s0, s9, s10
+; SI-NEXT:    s_sub_i32 s2, s9, s10
 ; SI-NEXT:  BB0_4: ; %endif
-; SI-NEXT:    s_add_i32 s0, s0, s8
+; SI-NEXT:    s_add_i32 s0, s2, s8
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -55,27 +55,27 @@ endif:
 define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
 ; SI-LABEL: sgpr_if_else_salu_br_opt:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_load_dword s2, s[0:1], 0x13
+; SI-NEXT:    s_load_dword s6, s[0:1], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_cmp_lg_u32 s2, 0
+; SI-NEXT:    s_cmp_lg_u32 s6, 0
 ; SI-NEXT:    s_cbranch_scc0 BB1_2
 ; SI-NEXT:  ; %bb.1: ; %else
-; SI-NEXT:    s_load_dword s3, s[0:1], 0x2e
-; SI-NEXT:    s_load_dword s6, s[0:1], 0x37
+; SI-NEXT:    s_load_dword s2, s[0:1], 0x2e
+; SI-NEXT:    s_load_dword s3, s[0:1], 0x37
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_add_i32 s3, s3, s6
+; SI-NEXT:    s_add_i32 s7, s2, s3
 ; SI-NEXT:    s_cbranch_execz BB1_3
 ; SI-NEXT:    s_branch BB1_4
 ; SI-NEXT:  BB1_2:
-; SI-NEXT:    ; implicit-def: $sgpr3
+; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:  BB1_3: ; %if
-; SI-NEXT:    s_load_dword s3, s[0:1], 0x1c
+; SI-NEXT:    s_load_dword s2, s[0:1], 0x1c
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x25
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_add_i32 s3, s3, s0
+; SI-NEXT:    s_add_i32 s7, s2, s0
 ; SI-NEXT:  BB1_4: ; %endif
-; SI-NEXT:    s_add_i32 s0, s3, s2
+; SI-NEXT:    s_add_i32 s0, s7, s6
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -109,18 +109,18 @@ define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xc
-; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    v_cmp_lg_f32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; SI-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; SI-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; SI-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
 ; SI-NEXT:    s_cbranch_execz BB2_2
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_add_i32 s6, s2, s3
+; SI-NEXT:    s_add_i32 s8, s2, s3
 ; SI-NEXT:  BB2_2: ; %Flow
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
-; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_or_saveexec_b64 s[2:3], s[6:7]
+; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    s_xor_b64 exec, exec, s[2:3]
 ; SI-NEXT:  ; %bb.3: ; %if
 ; SI-NEXT:    s_add_i32 s0, s0, s1
@@ -155,46 +155,45 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out,
 ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s10, 0
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s2, 0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT:    ; implicit-def: $sgpr0_sgpr1
-; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; SI-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; SI-NEXT:    s_xor_b64 s[10:11], exec, s[10:11]
 ; SI-NEXT:    s_cbranch_execz BB3_2
 ; SI-NEXT:  ; %bb.1: ; %else
-; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v0
-; SI-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-NEXT:    s_and_b64 s[8:9], vcc, exec
 ; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:  BB3_2: ; %Flow
-; SI-NEXT:    s_or_saveexec_b64 s[2:3], s[2:3]
-; SI-NEXT:    s_xor_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_or_saveexec_b64 s[0:1], s[10:11]
+; SI-NEXT:    s_xor_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execz BB3_4
 ; SI-NEXT:  ; %bb.3: ; %if
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s10, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; SI-NEXT:    s_mov_b32 s15, 0xf000
+; SI-NEXT:    s_mov_b32 s14, 0
+; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], exec
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_b64 s[6:7], vcc, exec
-; SI-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; SI-NEXT:    s_or_b64 s[8:9], s[2:3], s[6:7]
 ; SI-NEXT:  BB3_4: ; %endif
-; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[8:9]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 23a211b8d723e..7a04bb8c9a1b6 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -166,8 +166,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s8, s[0:1], 0xc
-; SI-NEXT:    s_brev_b32 s9, 44
+; SI-NEXT:    s_load_dword s14, s[0:1], 0xc
+; SI-NEXT:    s_brev_b32 s8, 44
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], s2, 1
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], s3, 4
@@ -176,53 +176,53 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; SI-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
 ; SI-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s9
+; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s8
 ; SI-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, 3
 ; SI-NEXT:    s_branch BB3_4
 ; SI-NEXT:  BB3_1: ; %Flow6
 ; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; SI-NEXT:    s_mov_b64 s[10:11], 0
+; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:  BB3_2: ; %Flow5
 ; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; SI-NEXT:    s_mov_b64 s[14:15], 0
+; SI-NEXT:    s_mov_b64 s[12:13], 0
 ; SI-NEXT:  BB3_3: ; %Flow
 ; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; SI-NEXT:    s_and_b64 vcc, exec, s[12:13]
+; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
 ; SI-NEXT:    s_cbranch_vccnz BB3_8
 ; SI-NEXT:  BB3_4: ; %while.cond
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    s_mov_b64 s[14:15], -1
-; SI-NEXT:    s_mov_b64 s[10:11], -1
 ; SI-NEXT:    s_mov_b64 s[12:13], -1
+; SI-NEXT:    s_mov_b64 s[8:9], -1
+; SI-NEXT:    s_mov_b64 s[10:11], -1
 ; SI-NEXT:    s_mov_b64 vcc, s[0:1]
 ; SI-NEXT:    s_cbranch_vccz BB3_3
 ; SI-NEXT:  ; %bb.5: ; %convex.exit
 ; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
+; SI-NEXT:    s_mov_b64 s[8:9], -1
 ; SI-NEXT:    s_mov_b64 s[10:11], -1
-; SI-NEXT:    s_mov_b64 s[12:13], -1
 ; SI-NEXT:    s_mov_b64 vcc, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz BB3_2
 ; SI-NEXT:  ; %bb.6: ; %if.end
 ; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; SI-NEXT:    s_mov_b64 s[12:13], -1
+; SI-NEXT:    s_mov_b64 s[10:11], -1
 ; SI-NEXT:    s_mov_b64 vcc, s[4:5]
 ; SI-NEXT:    s_cbranch_vccz BB3_1
 ; SI-NEXT:  ; %bb.7: ; %if.else
 ; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; SI-NEXT:    s_mov_b64 s[12:13], 0
+; SI-NEXT:    s_mov_b64 s[10:11], 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_branch BB3_1
 ; SI-NEXT:  BB3_8: ; %loop.exit.guard4
 ; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
+; SI-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_vccz BB3_4
 ; SI-NEXT:  ; %bb.9: ; %loop.exit.guard
-; SI-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; SI-NEXT:    s_and_b64 vcc, exec, s[12:13]
 ; SI-NEXT:    s_cbranch_vccz BB3_13
 ; SI-NEXT:  ; %bb.10: ; %for.cond.preheader
-; SI-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
+; SI-NEXT:    s_cmpk_lt_i32 s14, 0x3e8
 ; SI-NEXT:    s_cbranch_scc0 BB3_13
 ; SI-NEXT:  ; %bb.11: ; %for.body
 ; SI-NEXT:    s_and_b64 vcc, exec, 0
@@ -239,8 +239,8 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; FLAT-NEXT:    s_mov_b32 s6, -1
 ; FLAT-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; FLAT-NEXT:    s_load_dword s8, s[0:1], 0x30
-; FLAT-NEXT:    s_brev_b32 s9, 44
+; FLAT-NEXT:    s_load_dword s14, s[0:1], 0x30
+; FLAT-NEXT:    s_brev_b32 s8, 44
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT-NEXT:    v_cmp_lt_i32_e64 s[0:1], s2, 1
 ; FLAT-NEXT:    v_cmp_lt_i32_e64 s[4:5], s3, 4
@@ -249,53 +249,53 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32
 ; FLAT-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
 ; FLAT-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
-; FLAT-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s9
+; FLAT-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s8
 ; FLAT-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
 ; FLAT-NEXT:    v_mov_b32_e32 v0, 3
 ; FLAT-NEXT:    s_branch BB3_4
 ; FLAT-NEXT:  BB3_1: ; %Flow6
 ; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; FLAT-NEXT:    s_mov_b64 s[10:11], 0
+; FLAT-NEXT:    s_mov_b64 s[8:9], 0
 ; FLAT-NEXT:  BB3_2: ; %Flow5
 ; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; FLAT-NEXT:    s_mov_b64 s[14:15], 0
+; FLAT-NEXT:    s_mov_b64 s[12:13], 0
 ; FLAT-NEXT:  BB3_3: ; %Flow
 ; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; FLAT-NEXT:    s_and_b64 vcc, exec, s[12:13]
+; FLAT-NEXT:    s_and_b64 vcc, exec, s[10:11]
 ; FLAT-NEXT:    s_cbranch_vccnz BB3_8
 ; FLAT-NEXT:  BB3_4: ; %while.cond
 ; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
-; FLAT-NEXT:    s_mov_b64 s[14:15], -1
-; FLAT-NEXT:    s_mov_b64 s[10:11], -1
 ; FLAT-NEXT:    s_mov_b64 s[12:13], -1
+; FLAT-NEXT:    s_mov_b64 s[8:9], -1
+; FLAT-NEXT:    s_mov_b64 s[10:11], -1
 ; FLAT-NEXT:    s_mov_b64 vcc, s[0:1]
 ; FLAT-NEXT:    s_cbranch_vccz BB3_3
 ; FLAT-NEXT:  ; %bb.5: ; %convex.exit
 ; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
+; FLAT-NEXT:    s_mov_b64 s[8:9], -1
 ; FLAT-NEXT:    s_mov_b64 s[10:11], -1
-; FLAT-NEXT:    s_mov_b64 s[12:13], -1
 ; FLAT-NEXT:    s_mov_b64 vcc, s[2:3]
 ; FLAT-NEXT:    s_cbranch_vccz BB3_2
 ; FLAT-NEXT:  ; %bb.6: ; %if.end
 ; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; FLAT-NEXT:    s_mov_b64 s[12:13], -1
+; FLAT-NEXT:    s_mov_b64 s[10:11], -1
 ; FLAT-NEXT:    s_mov_b64 vcc, s[4:5]
 ; FLAT-NEXT:    s_cbranch_vccz BB3_1
 ; FLAT-NEXT:  ; %bb.7: ; %if.else
 ; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; FLAT-NEXT:    s_mov_b64 s[12:13], 0
+; FLAT-NEXT:    s_mov_b64 s[10:11], 0
 ; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    s_branch BB3_1
 ; FLAT-NEXT:  BB3_8: ; %loop.exit.guard4
 ; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
-; FLAT-NEXT:    s_and_b64 vcc, exec, s[10:11]
+; FLAT-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; FLAT-NEXT:    s_cbranch_vccz BB3_4
 ; FLAT-NEXT:  ; %bb.9: ; %loop.exit.guard
-; FLAT-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; FLAT-NEXT:    s_and_b64 vcc, exec, s[12:13]
 ; FLAT-NEXT:    s_cbranch_vccz BB3_13
 ; FLAT-NEXT:  ; %bb.10: ; %for.cond.preheader
-; FLAT-NEXT:    s_cmpk_lt_i32 s8, 0x3e8
+; FLAT-NEXT:    s_cmpk_lt_i32 s14, 0x3e8
 ; FLAT-NEXT:    s_cbranch_scc0 BB3_13
 ; FLAT-NEXT:  ; %bb.11: ; %for.body
 ; FLAT-NEXT:    s_and_b64 vcc, exec, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index aec41958fe0a7..a893379635e14 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -629,11 +629,11 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
 ; SI-NEXT:    s_mov_b64 s[0:1], exec
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz BB10_4
 ; SI-NEXT:  ; %bb.1: ; %bb.preheader
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:  BB10_2: ; %bb
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    ;;#ASMSTART
@@ -655,13 +655,13 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
 ; SI-NEXT:  ; %bb.3: ; %bb
 ; SI-NEXT:    ; in Loop: Header=BB10_2 Depth=1
 ; SI-NEXT:    s_andn2_b64 exec, exec, vcc
-; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_b64 vcc, exec, vcc
 ; SI-NEXT:    s_cbranch_vccnz BB10_2
 ; SI-NEXT:  BB10_4: ; %Flow1
-; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 8
@@ -1285,23 +1285,23 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ; SI-NEXT:    s_cbranch_scc1 BB15_7
 ; SI-NEXT:  ; %bb.1: ; %.lr.ph
 ; SI-NEXT:    s_mov_b64 s[2:3], exec
-; SI-NEXT:    s_mov_b32 s4, 0
+; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:    s_branch BB15_3
 ; SI-NEXT:  BB15_2: ; %latch
 ; SI-NEXT:    ; in Loop: Header=BB15_3 Depth=1
-; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SI-NEXT:    s_add_i32 s4, s4, 1
-; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s4, v1
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_add_i32 s6, s6, 1
+; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
 ; SI-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
-; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_mov_b32_e32 v2, s6
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execz BB15_6
 ; SI-NEXT:  BB15_3: ; %hdr
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; SI-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; SI-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; SI-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT:    s_cbranch_execz BB15_2
 ; SI-NEXT:  ; %bb.4: ; %kill
 ; SI-NEXT:    ; in Loop: Header=BB15_3 Depth=1
@@ -1328,23 +1328,23 @@ define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc1 BB15_7
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %.lr.ph
 ; GFX10-WAVE64-NEXT:    s_mov_b64 s[2:3], exec
-; GFX10-WAVE64-NEXT:    s_mov_b32 s4, 0
+; GFX10-WAVE64-NEXT:    s_mov_b32 s6, 0
 ; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX10-WAVE64-NEXT:    s_branch BB15_3
 ; GFX10-WAVE64-NEXT:  BB15_2: ; %latch
 ; GFX10-WAVE64-NEXT:    ; in Loop: Header=BB15_3 Depth=1
-; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX10-WAVE64-NEXT:    s_add_i32 s4, s4, 1
-; GFX10-WAVE64-NEXT:    v_cmp_ge_i32_e32 vcc, s4, v1
-; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-WAVE64-NEXT:    s_add_i32 s6, s6, 1
+; GFX10-WAVE64-NEXT:    v_cmp_ge_i32_e32 vcc, s6, v1
+; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-WAVE64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; GFX10-WAVE64-NEXT:    s_cbranch_execz BB15_6
 ; GFX10-WAVE64-NEXT:  BB15_3: ; %hdr
 ; GFX10-WAVE64-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-WAVE64-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX10-WAVE64-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX10-WAVE64-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
+; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX10-WAVE64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX10-WAVE64-NEXT:    s_cbranch_execz BB15_2
 ; GFX10-WAVE64-NEXT:  ; %bb.4: ; %kill
 ; GFX10-WAVE64-NEXT:    ; in Loop: Header=BB15_3 Depth=1

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index e0247b101f176..1bb7d8f1a4dd5 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -16,6 +16,7 @@
 
 ; GFX9-FLATSCR: s_mov_b32 [[SOFF1:s[0-9]+]], 4{{$}}
 ; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill
+; GFX9-FLATSCR: ;;#ASMSTART
 ; GFX9-FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x1{{[0-9a-f]+}}{{$}}
 ; GFX9-FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF2]] ; 16-byte Folded Reload
 

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index 2238976cba67d..6664820317d88 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -246,7 +246,7 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %
 ; GFX908-DAG: v_accvgpr_read_b32
 
 ; GCN:    NumVgprs: 256
-; GFX900: ScratchSize: 1796
+; GFX900: ScratchSize: 2052
 ; GFX908-FIXME: ScratchSize: 0
 ; GCN:    VGPRBlocks: 63
 ; GCN:    NumVGPRsForWavesPerEU: 256

diff  --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
index 317f240b6adf6..23714f0323ef3 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir
@@ -14,42 +14,39 @@ body:             |
   ; RA:   successors: %bb.1(0x80000000)
   ; RA:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
   ; RA:   [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; RA:   undef %5.sub1:sgpr_1024 = S_MOV_B32 -1
-  ; RA:   %5.sub0:sgpr_1024 = S_MOV_B32 -1
-  ; RA:   undef %4.sub0_sub1:sgpr_1024 = COPY %5.sub0_sub1
+  ; RA:   undef %2.sub1:sgpr_1024 = S_MOV_B32 -1
+  ; RA:   %2.sub0:sgpr_1024 = S_MOV_B32 -1
   ; RA:   undef %3.sub0:sgpr_1024 = S_MOV_B32 0
   ; RA: bb.1:
   ; RA:   successors: %bb.2(0x80000000)
-  ; RA:   undef %6.sub0_sub1:sgpr_1024 = COPY %4.sub0_sub1
-  ; RA:   %6.sub2:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub3:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub4:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub5:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub6:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub7:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub8:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub9:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub10:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub11:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub12:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub13:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub14:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub15:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub16:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub17:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub18:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub19:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub20:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub21:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub22:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub23:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub24:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub25:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub26:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub27:sgpr_1024 = COPY %6.sub1
-  ; RA:   %6.sub28:sgpr_1024 = COPY %6.sub0
-  ; RA:   %6.sub29:sgpr_1024 = COPY %6.sub1
-  ; RA:   undef %4.sub0_sub1:sgpr_1024 = COPY %6.sub0_sub1
+  ; RA:   %2.sub2:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub3:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub4:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub5:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub6:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub7:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub8:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub9:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub10:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub11:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub12:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub13:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub14:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub15:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub16:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub17:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub18:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub19:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub20:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub21:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub22:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub23:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub24:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub25:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub26:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub27:sgpr_1024 = COPY %2.sub1
+  ; RA:   %2.sub28:sgpr_1024 = COPY %2.sub0
+  ; RA:   %2.sub29:sgpr_1024 = COPY %2.sub1
   ; RA:   %3.sub1:sgpr_1024 = COPY %3.sub0
   ; RA:   %3.sub2:sgpr_1024 = COPY %3.sub0
   ; RA:   %3.sub3:sgpr_1024 = COPY %3.sub0
@@ -89,79 +86,77 @@ body:             |
   ; VR-LABEL: name: splitkit_copy_bundle
   ; VR: bb.0:
   ; VR:   successors: %bb.1(0x80000000)
-  ; VR:   renamable $sgpr69 = S_MOV_B32 -1
-  ; VR:   renamable $sgpr68 = S_MOV_B32 -1
-  ; VR:   renamable $sgpr36 = S_MOV_B32 0
+  ; VR:   renamable $sgpr37 = S_MOV_B32 -1
+  ; VR:   renamable $sgpr36 = S_MOV_B32 -1
+  ; VR:   renamable $sgpr68 = S_MOV_B32 0
   ; VR:   renamable $sgpr34_sgpr35 = IMPLICIT_DEF
-  ; VR:   renamable $sgpr70_sgpr71 = IMPLICIT_DEF
+  ; VR:   renamable $sgpr66_sgpr67 = IMPLICIT_DEF
   ; VR: bb.1:
   ; VR:   successors: %bb.2(0x80000000)
-  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71
-  ; VR:   renamable $sgpr40_sgpr41 = COPY killed renamable $sgpr68_sgpr69
-  ; VR:   renamable $sgpr42 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr43 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr44 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr45 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr46 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr47 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr48 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr49 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr50 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr51 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr52 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr53 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr54 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr55 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr56 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr57 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr58 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr59 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr60 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr61 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr62 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr63 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr64 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr65 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr66 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr67 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr68 = COPY renamable $sgpr40
-  ; VR:   renamable $sgpr69 = COPY renamable $sgpr41
-  ; VR:   renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr40_sgpr41
-  ; VR:   renamable $sgpr37 = COPY renamable $sgpr36
+  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr34_sgpr35, $sgpr66_sgpr67
   ; VR:   renamable $sgpr38 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr39 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr39 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr40 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr41 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr41 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr42 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr43 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr43 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr44 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr45 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr45 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr46 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr47 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr47 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr48 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr49 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr49 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr50 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr51 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr51 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr52 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr53 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr53 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr54 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr55 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr55 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr56 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr57 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr57 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr58 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr59 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr59 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr60 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr61 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr61 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr62 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr63 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr63 = COPY renamable $sgpr37
   ; VR:   renamable $sgpr64 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr65 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr66 = COPY renamable $sgpr36
-  ; VR:   renamable $sgpr67 = COPY renamable $sgpr36
+  ; VR:   renamable $sgpr65 = COPY renamable $sgpr37
+  ; VR:   renamable $sgpr69 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr70 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr71 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr72 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr73 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr74 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr75 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr76 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr77 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr78 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr79 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr80 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr81 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr82 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr83 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr84 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr85 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr86 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr87 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr88 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr89 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr90 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr91 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr92 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr93 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr94 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr95 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr96 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr97 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr98 = COPY renamable $sgpr68
+  ; VR:   renamable $sgpr99 = COPY renamable $sgpr68
   ; VR: bb.2:
   ; VR:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71
-  ; VR:   S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr70_sgpr71
+  ; VR:   liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr34_sgpr35, $sgpr66_sgpr67
+  ; VR:   S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr66_sgpr67
   ; VR:   S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc
   ; VR:   S_BRANCH %bb.2
   bb.0:
@@ -309,11 +304,11 @@ body:             |
     ; VR: renamable $sgpr9 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr13, 0 :: (dereferenceable invariant load (s32))
     ; VR: renamable $sgpr14 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr15, 0 :: (dereferenceable invariant load (s32))
     ; VR: renamable $sgpr10_sgpr11 = IMPLICIT_DEF
-    ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22,  0 :: (dereferenceable invariant load (s32))
-    ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16,  0 :: (dereferenceable invariant load (s32))
-    ; VR: renamable $sgpr12 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr18,  0 :: (dereferenceable invariant load (s32))
-    ; VR: renamable $sgpr13 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr19,  0 :: (dereferenceable invariant load (s32))
-    ; VR: renamable $sgpr16 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr21,  0 :: (dereferenceable invariant load (s32))
+    ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0 :: (dereferenceable invariant load (s32))
+    ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0 :: (dereferenceable invariant load (s32))
+    ; VR: renamable $sgpr12 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr18, 0 :: (dereferenceable invariant load (s32))
+    ; VR: renamable $sgpr13 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr19, 0 :: (dereferenceable invariant load (s32))
+    ; VR: renamable $sgpr16 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr21, 0 :: (dereferenceable invariant load (s32))
     ; VR: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr10_sgpr11, implicit killed renamable $sgpr8, implicit killed renamable $sgpr9, implicit killed renamable $sgpr12, implicit killed renamable $sgpr13, implicit killed renamable $sgpr14, implicit killed renamable $sgpr15, implicit killed renamable $sgpr16, implicit killed renamable $sgpr17
     %0:sgpr_128 = IMPLICIT_DEF
     %1:sreg_64 = IMPLICIT_DEF

diff  --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 95d575ec95d70..e36b7893c618b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -133,78 +133,78 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[0:1], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s0
-; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s1
-; GCN-IR-NEXT:    s_min_u32 s10, s14, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[12:13], -1
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
+; GCN-IR-NEXT:    s_min_u32 s8, s12, s8
+; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
+; GCN-IR-NEXT:    s_sub_u32 s10, s8, s12
+; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
 ; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s8, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_addc_u32 s17, s9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[6:7], s8
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[6:7], s16
-; GCN-IR-NEXT:    s_add_u32 s8, s0, -1
-; GCN-IR-NEXT:    s_addc_u32 s9, s1, -1
-; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s2, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s3, s11
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[6:7], s14
+; GCN-IR-NEXT:    s_add_u32 s16, s0, -1
+; GCN-IR-NEXT:    s_addc_u32 s17, s1, -1
+; GCN-IR-NEXT:    s_not_b64 s[2:3], s[8:9]
+; GCN-IR-NEXT:    s_mov_b32 s13, s9
+; GCN-IR-NEXT:    s_add_u32 s8, s2, s12
+; GCN-IR-NEXT:    s_addc_u32 s9, s3, s9
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s13, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s2, s8, s16
-; GCN-IR-NEXT:    s_subb_u32 s2, s9, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s2, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s2, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s18
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s19
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[2:3]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s2, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s2, s17, s15
+; GCN-IR-NEXT:    s_ashr_i32 s12, s2, 31
+; GCN-IR-NEXT:    s_mov_b32 s13, s12
+; GCN-IR-NEXT:    s_and_b32 s2, s12, 1
+; GCN-IR-NEXT:    s_and_b64 s[18:19], s[12:13], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s14, s14, s18
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_subb_u32 s15, s15, s19
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
+; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
 ; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch BB0_6
 ; GCN-IR-NEXT:  BB0_5:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
 ; GCN-IR-NEXT:  BB0_6: ; %udiv-end
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s0, v1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s0, v0
@@ -372,72 +372,72 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
 ; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v6
-; GCN-IR-NEXT:    v_min_u32_e32 v12, v3, v7
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
-; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v14, v3, v7
-; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v12, v14
+; GCN-IR-NEXT:    v_min_u32_e32 v3, v3, v7
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v7, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
+; GCN-IR-NEXT:    v_ffbh_u32_e32 v8, v1
+; GCN-IR-NEXT:    v_min_u32_e32 v12, v7, v8
+; GCN-IR-NEXT:    v_sub_i32_e32 v7, vcc, v3, v12
 ; GCN-IR-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[7:8]
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[6:7], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v13
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v1, 0, s[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v13, v11
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v10, v1, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v0, 0, s[6:7]
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v9, vcc, 1, v7
-; GCN-IR-NEXT:    v_addc_u32_e32 v10, vcc, 0, v8, vcc
-; GCN-IR-NEXT:    v_sub_i32_e64 v3, s[4:5], 63, v7
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[9:10], v[7:8]
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v7
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v8, vcc
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[14:15], v[7:8]
+; GCN-IR-NEXT:    v_sub_i32_e64 v7, s[4:5], 63, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[0:1], v7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
-; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[0:1], v3
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, -1, v5
-; GCN-IR-NEXT:    v_lshr_b64 v[16:17], v[0:1], v9
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v6, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v10, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v18, 0
-; GCN-IR-NEXT:    v_not_b32_e32 v11, v13
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, v10, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, v11, v15, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v18, vcc, -1, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, -1, v6, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v3, v3
+; GCN-IR-NEXT:    v_not_b32_e32 v9, v11
+; GCN-IR-NEXT:    v_add_i32_e32 v11, vcc, v3, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[0:1], v14
+; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v12, vcc, v9, v13, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[14:15], v[16:17], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v10, 31, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v14, v14, v10
+; GCN-IR-NEXT:    v_lshl_b64 v[14:15], v[14:15], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v3, 31, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v14, v3
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v10, vcc, v3, v14
-; GCN-IR-NEXT:    v_subb_u32_e32 v10, vcc, v9, v15, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v18, v7
-; GCN-IR-NEXT:    v_add_i32_e32 v18, vcc, 1, v12
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v16, 31, v10
-; GCN-IR-NEXT:    v_or_b32_e32 v8, v19, v8
-; GCN-IR-NEXT:    v_addc_u32_e32 v19, vcc, 0, v13, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[18:19], v[12:13]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v18
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-IR-NEXT:    v_and_b32_e32 v10, 1, v16
-; GCN-IR-NEXT:    v_and_b32_e32 v17, v16, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v16, v16, v5
-; GCN-IR-NEXT:    v_sub_i32_e64 v16, s[4:5], v14, v16
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v19
-; GCN-IR-NEXT:    v_mov_b32_e32 v19, v11
-; GCN-IR-NEXT:    v_subb_u32_e64 v17, s[4:5], v15, v17, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e32 v9, vcc, v18, v3
+; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, v19, v15, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v16, v7
+; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v11
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v13, 31, v9
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v17, v8
+; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v12, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12]
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v16
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_and_b32_e32 v9, 1, v13
+; GCN-IR-NEXT:    v_and_b32_e32 v20, v13, v6
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v13, v5
+; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v3, v13
+; GCN-IR-NEXT:    v_mov_b32_e32 v12, v17
+; GCN-IR-NEXT:    v_mov_b32_e32 v17, v10
+; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v15, v20, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v18, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v16, v9
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
@@ -445,11 +445,11 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v9, v10, v7
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v8
+; GCN-IR-NEXT:    v_or_b32_e32 v9, v9, v7
 ; GCN-IR-NEXT:  BB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, v5, v3
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, v5, v10
 ; GCN-IR-NEXT:    v_mul_hi_u32 v7, v5, v9
 ; GCN-IR-NEXT:    v_mul_lo_u32 v6, v6, v9
 ; GCN-IR-NEXT:    v_mul_lo_u32 v5, v5, v9
@@ -1030,79 +1030,79 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[6:7], 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[12:13]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s8
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s2
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s9
-; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    s_min_u32 s16, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[14:15], -1
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
+; GCN-IR-NEXT:    s_min_u32 s14, s12, s13
+; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[16:17], -1
 ; GCN-IR-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s19, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
+; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[2:3], s18
-; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
-; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
-; GCN-IR-NEXT:    s_not_b64 s[6:7], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s6, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[2:3], s16
+; GCN-IR-NEXT:    s_add_u32 s18, s8, -1
+; GCN-IR-NEXT:    s_addc_u32 s19, s9, -1
+; GCN-IR-NEXT:    s_not_b64 s[6:7], s[10:11]
+; GCN-IR-NEXT:    s_mov_b32 s15, s11
+; GCN-IR-NEXT:    s_add_u32 s10, s6, s14
+; GCN-IR-NEXT:    s_addc_u32 s11, s7, s11
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s6, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[6:7]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s6, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s6, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s6, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s6, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[6:7]
+; GCN-IR-NEXT:    s_lshr_b32 s6, s13, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[6:7]
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
+; GCN-IR-NEXT:    s_sub_u32 s6, s18, s16
+; GCN-IR-NEXT:    s_subb_u32 s6, s19, s17
+; GCN-IR-NEXT:    s_ashr_i32 s14, s6, 31
+; GCN-IR-NEXT:    s_mov_b32 s15, s14
+; GCN-IR-NEXT:    s_and_b32 s6, s14, 1
+; GCN-IR-NEXT:    s_and_b64 s[20:21], s[14:15], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s16, s16, s20
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_subb_u32 s17, s17, s21
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_3
 ; GCN-IR-NEXT:  BB8_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-IR-NEXT:    s_branch BB8_6
 ; GCN-IR-NEXT:  BB8_5:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[16:17]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[16:17]
 ; GCN-IR-NEXT:  BB8_6: ; %udiv-end
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
@@ -1192,79 +1192,79 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[2:3], 0
 ; GCN-IR-NEXT:    s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[10:11], s[12:13]
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[12:13]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s2
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    s_min_u32 s16, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s12, s16
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s13, 0
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[10:11], 63
-; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[14:15], -1
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s3
+; GCN-IR-NEXT:    s_min_u32 s10, s10, s11
+; GCN-IR-NEXT:    s_min_u32 s14, s12, s13
+; GCN-IR-NEXT:    s_sub_u32 s12, s10, s14
+; GCN-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[18:19], s[12:13], 63
+; GCN-IR-NEXT:    s_xor_b64 s[20:21], s[16:17], -1
 ; GCN-IR-NEXT:    s_and_b64 s[18:19], s[20:21], s[18:19]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s18, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s19, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
+; GCN-IR-NEXT:    s_add_u32 s16, s12, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-IR-NEXT:    s_addc_u32 s17, s13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s12, 63, s12
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[2:3], s10
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[2:3], s12
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[18:19], s[2:3], s18
-; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
-; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
-; GCN-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
-; GCN-IR-NEXT:    s_mov_b32 s17, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s8, s16
-; GCN-IR-NEXT:    s_addc_u32 s13, s9, s13
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[2:3], s16
+; GCN-IR-NEXT:    s_add_u32 s18, s6, -1
+; GCN-IR-NEXT:    s_addc_u32 s19, s7, -1
+; GCN-IR-NEXT:    s_not_b64 s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_mov_b32 s15, s11
+; GCN-IR-NEXT:    s_add_u32 s10, s8, s14
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, s11
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
 ; GCN-IR-NEXT:    s_mov_b32 s9, 0
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s8, s15, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[18:19], s[18:19], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[16:17], s[14:15]
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s18
-; GCN-IR-NEXT:    s_subb_u32 s8, s11, s19
-; GCN-IR-NEXT:    s_ashr_i32 s16, s8, 31
-; GCN-IR-NEXT:    s_mov_b32 s17, s16
-; GCN-IR-NEXT:    s_and_b32 s8, s16, 1
-; GCN-IR-NEXT:    s_and_b64 s[20:21], s[16:17], s[6:7]
-; GCN-IR-NEXT:    s_sub_u32 s18, s18, s20
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-IR-NEXT:    s_subb_u32 s19, s19, s21
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s13
-; GCN-IR-NEXT:    s_add_u32 s12, s12, 1
-; GCN-IR-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_lshr_b32 s8, s13, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
+; GCN-IR-NEXT:    s_sub_u32 s8, s18, s16
+; GCN-IR-NEXT:    s_subb_u32 s8, s19, s17
+; GCN-IR-NEXT:    s_ashr_i32 s14, s8, 31
+; GCN-IR-NEXT:    s_mov_b32 s15, s14
+; GCN-IR-NEXT:    s_and_b32 s8, s14, 1
+; GCN-IR-NEXT:    s_and_b64 s[20:21], s[14:15], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s16, s16, s20
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_subb_u32 s17, s17, s21
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
+; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_3
 ; GCN-IR-NEXT:  BB9_4: ; %Flow3
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], 1
 ; GCN-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    s_branch BB9_6
 ; GCN-IR-NEXT:  BB9_5:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[16:17]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[16:17]
 ; GCN-IR-NEXT:  BB9_6: ; %udiv-end
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s6, v1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
@@ -1416,61 +1416,61 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s4
 ; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s5
-; GCN-IR-NEXT:    s_min_u32 s8, s2, s3
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
+; GCN-IR-NEXT:    s_min_u32 s6, s2, s3
+; GCN-IR-NEXT:    s_add_u32 s8, s6, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
 ; GCN-IR-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[8:9], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
 ; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s6, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
+; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s7, 63, s8
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s6
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s7
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s12
-; GCN-IR-NEXT:    s_add_u32 s6, s4, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s5, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s10
+; GCN-IR-NEXT:    s_add_u32 s14, s4, -1
+; GCN-IR-NEXT:    s_addc_u32 s15, s5, -1
+; GCN-IR-NEXT:    s_sub_u32 s6, 58, s6
+; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB10_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s2, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s2, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s2, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s2, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[4:5]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[2:3]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s2, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s2, s15, s13
+; GCN-IR-NEXT:    s_ashr_i32 s10, s2, 31
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_and_b32 s2, s10, 1
+; GCN-IR-NEXT:    s_and_b64 s[16:17], s[10:11], s[4:5]
+; GCN-IR-NEXT:    s_sub_u32 s12, s12, s16
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_subb_u32 s13, s13, s17
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
+; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB10_3
 ; GCN-IR-NEXT:  BB10_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
@@ -1614,26 +1614,26 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GCN-IR-NEXT:    v_min_u32_e32 v6, v2, v3
 ; GCN-IR-NEXT:    s_movk_i32 s6, 0xffc5
-; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, s6, v6
 ; GCN-IR-NEXT:    v_addc_u32_e64 v4, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[3:4]
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[3:4]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, 24, 0, s[4:5]
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v7
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB11_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, 0, v4, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v4, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v2, s[4:5], 63, v3
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[3:4]
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[3:4]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], 24, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
@@ -1642,38 +1642,38 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB11_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], 24, v6
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, -1, v1, vcc
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, 58, v8
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-IR-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, 58, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], 24, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_subb_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-IR-NEXT:  BB11_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v4
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v6, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v7, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v12
+; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v4, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v7, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7]
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v5
-; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v4
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB11_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
@@ -1847,39 +1847,39 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB12_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v8
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB12_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v9, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v4
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, v12
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow

diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index a5b70174c1460..f8140f9ba0143 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -135,68 +135,68 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[0:1], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s0
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
 ; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s1
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_min_u32 s8, s12, s8
-; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s8, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    s_min_u32 s10, s12, s8
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_min_u32 s12, s8, s9
+; GCN-IR-NEXT:    s_sub_u32 s8, s10, s12
+; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
 ; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    s_add_u32 s14, s8, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_addc_u32 s15, s9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[6:7], s14
-; GCN-IR-NEXT:    s_add_u32 s6, s0, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s1, -1
-; GCN-IR-NEXT:    s_not_b64 s[2:3], s[8:9]
-; GCN-IR-NEXT:    s_mov_b32 s13, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s2, s12
-; GCN-IR-NEXT:    s_addc_u32 s9, s3, s9
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT:    s_add_u32 s16, s0, -1
+; GCN-IR-NEXT:    s_addc_u32 s17, s1, -1
+; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
+; GCN-IR-NEXT:    s_add_u32 s6, s2, s12
+; GCN-IR-NEXT:    s_addc_u32 s7, s3, s11
+; GCN-IR-NEXT:    s_mov_b32 s13, s11
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s2, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s2, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s2, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s2, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
+; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[2:3]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s2, s16, s12
+; GCN-IR-NEXT:    s_subb_u32 s2, s17, s13
+; GCN-IR-NEXT:    s_ashr_i32 s10, s2, 31
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_and_b32 s2, s10, 1
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[10:11], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s14, s12, s14
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_subb_u32 s15, s13, s15
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
+; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[2:3]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
 ; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[10:11], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[0:1], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -370,40 +370,40 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
 ; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, -1, v2
-; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v6, v8
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
-; GCN-IR-NEXT:    v_not_b32_e32 v7, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v7, v11, vcc
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v0, v8
+; GCN-IR-NEXT:    v_not_b32_e32 v1, v9
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[12:13], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
 ; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v0, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v1, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v14, v4
-; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v8
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v15, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v9, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, v14
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v15, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v6
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v8, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v8, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v9, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, v8
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
 ; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v10, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v7
 ; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v14, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v6
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
@@ -833,68 +833,68 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s2
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
 ; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s3
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    s_min_u32 s8, s8, s9
-; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
-; GCN-IR-NEXT:    s_sub_u32 s10, s8, s12
-; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[10:11], 63
-; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    s_min_u32 s10, s8, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_min_u32 s12, s8, s9
+; GCN-IR-NEXT:    s_sub_u32 s8, s10, s12
+; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
+; GCN-IR-NEXT:    s_mov_b32 s11, 0
 ; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
 ; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    s_add_u32 s14, s8, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_addc_u32 s15, s9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
+; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[6:7], s8
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[6:7], s14
-; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
-; GCN-IR-NEXT:    s_not_b64 s[0:1], s[8:9]
-; GCN-IR-NEXT:    s_mov_b32 s13, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s0, s12
-; GCN-IR-NEXT:    s_addc_u32 s9, s1, s9
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT:    s_add_u32 s16, s2, -1
+; GCN-IR-NEXT:    s_addc_u32 s17, s3, -1
+; GCN-IR-NEXT:    s_not_b64 s[0:1], s[10:11]
+; GCN-IR-NEXT:    s_add_u32 s6, s0, s12
+; GCN-IR-NEXT:    s_addc_u32 s7, s1, s11
+; GCN-IR-NEXT:    s_mov_b32 s13, s11
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s1, 0
 ; GCN-IR-NEXT:  BB7_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s0, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s0, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s0, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s0, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s0, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[0:1]
+; GCN-IR-NEXT:    s_lshr_b32 s0, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[0:1]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s0, s16, s12
+; GCN-IR-NEXT:    s_subb_u32 s0, s17, s13
+; GCN-IR-NEXT:    s_ashr_i32 s10, s0, 31
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_and_b32 s0, s10, 1
+; GCN-IR-NEXT:    s_and_b64 s[14:15], s[10:11], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s14, s12, s14
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_subb_u32 s15, s13, s15
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
+; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[0:1]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_3
 ; GCN-IR-NEXT:  BB7_4: ; %Flow3
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[10:11], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
@@ -1034,61 +1034,61 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
 ; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_min_u32 s8, s4, s5
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
+; GCN-IR-NEXT:    s_min_u32 s6, s4, s5
+; GCN-IR-NEXT:    s_add_u32 s8, s6, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[8:9], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
 ; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s6, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
+; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s7, 63, s8
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s6
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s7
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s12
-; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s10
+; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
+; GCN-IR-NEXT:    s_addc_u32 s15, s3, -1
+; GCN-IR-NEXT:    s_sub_u32 s6, 58, s6
+; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s4, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s4, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT:    s_ashr_i32 s10, s4, 31
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_and_b32 s4, s10, 1
+; GCN-IR-NEXT:    s_and_b64 s[16:17], s[10:11], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s12, s12, s16
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_subb_u32 s13, s13, s17
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
+; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB8_3
 ; GCN-IR-NEXT:  BB8_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[10:11], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
@@ -1245,39 +1245,39 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v8
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB9_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v9, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v4
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, v12
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB9_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow

diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 2582a874f4160..5215d12a491e3 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -202,14 +202,14 @@ define hidden void @blam() {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GCN-NEXT:    flat_load_dword v41, v[1:2]
-; GCN-NEXT:    v_mov_b32_e32 v43, 0
+; GCN-NEXT:    flat_load_dword v43, v[1:2]
+; GCN-NEXT:    v_mov_b32_e32 v42, 0
 ; GCN-NEXT:    s_getpc_b64 s[36:37]
 ; GCN-NEXT:    s_add_u32 s36, s36, spam@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s37, s37, spam@rel32@hi+12
-; GCN-NEXT:    v_lshlrev_b32_e32 v42, 2, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v41, 2, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_f32_e64 s[34:35], 0, v41
+; GCN-NEXT:    v_cmp_eq_f32_e64 s[34:35], 0, v43
 ; GCN-NEXT:    s_branch BB1_3
 ; GCN-NEXT:  BB1_1: ; %bb10
 ; GCN-NEXT:    ; in Loop: Header=BB1_3 Depth=1
@@ -228,7 +228,7 @@ define hidden void @blam() {
 ; GCN-NEXT:  BB1_4: ; %bb2
 ; GCN-NEXT:    ; Parent Loop BB1_3 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
-; GCN-NEXT:    flat_load_dword v0, v[42:43]
+; GCN-NEXT:    flat_load_dword v0, v[41:42]
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
@@ -272,7 +272,7 @@ define hidden void @blam() {
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:  BB1_10: ; %bb17
 ; GCN-NEXT:    ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], 0
 ; GCN-NEXT:    s_branch BB1_2
 bb:
   %tmp = load float, float* null, align 16

diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 1cd8d78d8459f..1822c76618f3d 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -133,78 +133,78 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[0:1], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s0
-; GCN-IR-NEXT:    s_add_i32 s14, s12, 32
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s1
-; GCN-IR-NEXT:    s_min_u32 s10, s14, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
-; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
-; GCN-IR-NEXT:    s_min_u32 s14, s8, s9
-; GCN-IR-NEXT:    s_sub_u32 s8, s10, s14
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_mov_b32 s11, 0
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[8:9], 63
-; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[12:13], -1
+; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
+; GCN-IR-NEXT:    s_min_u32 s8, s12, s8
+; GCN-IR-NEXT:    s_min_u32 s12, s10, s11
+; GCN-IR-NEXT:    s_sub_u32 s10, s8, s12
+; GCN-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT:    s_mov_b32 s9, 0
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[16:17], s[10:11], 63
+; GCN-IR-NEXT:    s_xor_b64 s[18:19], s[14:15], -1
 ; GCN-IR-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[16:17]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s16, s8, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_addc_u32 s17, s9, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GCN-IR-NEXT:    s_add_u32 s14, s10, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
+; GCN-IR-NEXT:    s_addc_u32 s15, s11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s10, 63, s10
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[6:7], s8
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[6:7], s10
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[16:17], s[6:7], s16
-; GCN-IR-NEXT:    s_add_u32 s8, s0, -1
-; GCN-IR-NEXT:    s_addc_u32 s9, s1, -1
-; GCN-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
-; GCN-IR-NEXT:    s_mov_b32 s15, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s2, s14
-; GCN-IR-NEXT:    s_addc_u32 s11, s3, s11
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[14:15], s[6:7], s14
+; GCN-IR-NEXT:    s_add_u32 s16, s0, -1
+; GCN-IR-NEXT:    s_addc_u32 s17, s1, -1
+; GCN-IR-NEXT:    s_not_b64 s[2:3], s[8:9]
+; GCN-IR-NEXT:    s_mov_b32 s13, s9
+; GCN-IR-NEXT:    s_add_u32 s8, s2, s12
+; GCN-IR-NEXT:    s_addc_u32 s9, s3, s9
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0
 ; GCN-IR-NEXT:  BB0_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s2, s13, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
-; GCN-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[2:3]
-; GCN-IR-NEXT:    s_or_b64 s[12:13], s[14:15], s[12:13]
-; GCN-IR-NEXT:    s_sub_u32 s2, s8, s16
-; GCN-IR-NEXT:    s_subb_u32 s2, s9, s17
-; GCN-IR-NEXT:    s_ashr_i32 s14, s2, 31
-; GCN-IR-NEXT:    s_mov_b32 s15, s14
-; GCN-IR-NEXT:    s_and_b32 s2, s14, 1
-; GCN-IR-NEXT:    s_and_b64 s[18:19], s[14:15], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s16, s16, s18
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s10
-; GCN-IR-NEXT:    s_subb_u32 s17, s17, s19
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-IR-NEXT:    s_add_u32 s10, s10, 1
-; GCN-IR-NEXT:    s_addc_u32 s11, s11, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GCN-IR-NEXT:    s_lshr_b32 s2, s11, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[2:3]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s2, s16, s14
+; GCN-IR-NEXT:    s_subb_u32 s2, s17, s15
+; GCN-IR-NEXT:    s_ashr_i32 s12, s2, 31
+; GCN-IR-NEXT:    s_mov_b32 s13, s12
+; GCN-IR-NEXT:    s_and_b32 s2, s12, 1
+; GCN-IR-NEXT:    s_and_b64 s[18:19], s[12:13], s[0:1]
+; GCN-IR-NEXT:    s_sub_u32 s14, s14, s18
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_subb_u32 s15, s15, s19
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
+; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[2:3]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB0_3
 ; GCN-IR-NEXT:  BB0_4: ; %Flow6
-; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[10:11], 1
 ; GCN-IR-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-IR-NEXT:    s_branch BB0_6
 ; GCN-IR-NEXT:  BB0_5:
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[12:13]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[14:15]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[12:13]
+; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
 ; GCN-IR-NEXT:  BB0_6: ; %udiv-end
 ; GCN-IR-NEXT:    v_mul_lo_u32 v1, s0, v1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s0, v0
@@ -348,71 +348,71 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
-; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GCN-IR-NEXT:    v_min_u32_e32 v8, v4, v5
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v1
-; GCN-IR-NEXT:    v_min_u32_e32 v12, v4, v5
-; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, v10, v12
+; GCN-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GCN-IR-NEXT:    v_sub_i32_e32 v5, vcc, v8, v10
 ; GCN-IR-NEXT:    v_subb_u32_e64 v6, s[6:7], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[5:6]
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[5:6]
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v9
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v8, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v6, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e64 v4, s[4:5], 63, v5
-; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[7:8], v[5:6]
-; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[5:6]
+; GCN-IR-NEXT:    v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB1_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    v_add_i32_e32 v6, vcc, -1, v2
-; GCN-IR-NEXT:    v_lshr_b64 v[14:15], v[0:1], v7
-; GCN-IR-NEXT:    v_addc_u32_e32 v7, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v8, v10
-; GCN-IR-NEXT:    v_mov_b32_e32 v16, 0
-; GCN-IR-NEXT:    v_not_b32_e32 v9, v11
-; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, v8, v12
-; GCN-IR-NEXT:    v_mov_b32_e32 v17, 0
-; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, v9, v13, vcc
+; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, -1, v2
+; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, -1, v3, vcc
+; GCN-IR-NEXT:    v_not_b32_e32 v6, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
+; GCN-IR-NEXT:    v_not_b32_e32 v7, v9
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v10
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, 0
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v7, v11, vcc
 ; GCN-IR-NEXT:  BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[12:13], v[14:15], 1
-; GCN-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v12, v12, v8
+; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[12:13], 1
+; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, v6, v12
-; GCN-IR-NEXT:    v_subb_u32_e32 v8, vcc, v7, v13, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v16, v4
-; GCN-IR-NEXT:    v_add_i32_e32 v16, vcc, 1, v10
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
-; GCN-IR-NEXT:    v_or_b32_e32 v5, v17, v5
-; GCN-IR-NEXT:    v_addc_u32_e32 v17, vcc, 0, v11, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v10, v16
-; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-IR-NEXT:    v_and_b32_e32 v8, 1, v14
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v14, v3
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v14, v2
-; GCN-IR-NEXT:    v_sub_i32_e64 v14, s[4:5], v12, v14
-; GCN-IR-NEXT:    v_mov_b32_e32 v11, v17
-; GCN-IR-NEXT:    v_mov_b32_e32 v17, v9
-; GCN-IR-NEXT:    v_subb_u32_e64 v15, s[4:5], v13, v15, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v16, v10
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v17, v11, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v14, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, 1, v8
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v5, v15, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, 0, v9, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9]
+; GCN-IR-NEXT:    v_mov_b32_e32 v8, v14
+; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GCN-IR-NEXT:    v_and_b32_e32 v13, v12, v3
+; GCN-IR-NEXT:    v_and_b32_e32 v12, v12, v2
+; GCN-IR-NEXT:    v_sub_i32_e64 v12, s[4:5], v10, v12
+; GCN-IR-NEXT:    v_mov_b32_e32 v9, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v15, v7
+; GCN-IR-NEXT:    v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v16, v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v14, v6
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow
@@ -420,8 +420,8 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:  BB1_5: ; %Flow3
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT:    v_or_b32_e32 v7, v9, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v4, v8, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v7, v7, v5
+; GCN-IR-NEXT:    v_or_b32_e32 v4, v6, v4
 ; GCN-IR-NEXT:  BB1_6: ; %Flow4
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mul_lo_u32 v5, v2, v7
@@ -853,61 +853,61 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
 ; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
-; GCN-IR-NEXT:    s_min_u32 s8, s4, s5
-; GCN-IR-NEXT:    s_add_u32 s6, s8, 0xffffffc5
-; GCN-IR-NEXT:    s_addc_u32 s7, 0, -1
+; GCN-IR-NEXT:    s_min_u32 s6, s4, s5
+; GCN-IR-NEXT:    s_add_u32 s8, s6, 0xffffffc5
+; GCN-IR-NEXT:    s_addc_u32 s9, 0, -1
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[6:7], 63
+; GCN-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
-; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[6:7], 63
+; GCN-IR-NEXT:    v_cmp_ne_u64_e64 s[12:13], s[8:9], 63
 ; GCN-IR-NEXT:    s_xor_b64 s[14:15], s[10:11], -1
 ; GCN-IR-NEXT:    s_and_b64 s[12:13], s[14:15], s[12:13]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB6_5
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT:    s_add_u32 s12, s6, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
-; GCN-IR-NEXT:    s_addc_u32 s13, s7, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1]
-; GCN-IR-NEXT:    s_sub_i32 s6, 63, s6
+; GCN-IR-NEXT:    s_add_u32 s10, s8, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
+; GCN-IR-NEXT:    s_addc_u32 s11, s9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1]
+; GCN-IR-NEXT:    s_sub_i32 s7, 63, s8
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, vcc
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], 24, s6
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], 24, s7
 ; GCN-IR-NEXT:    s_cbranch_vccz BB6_4
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT:    s_lshr_b64 s[14:15], 24, s12
-; GCN-IR-NEXT:    s_add_u32 s6, s2, -1
-; GCN-IR-NEXT:    s_addc_u32 s7, s3, -1
-; GCN-IR-NEXT:    s_sub_u32 s8, 58, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, 0, 0
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GCN-IR-NEXT:    s_lshr_b64 s[12:13], 24, s10
+; GCN-IR-NEXT:    s_add_u32 s14, s2, -1
+; GCN-IR-NEXT:    s_addc_u32 s15, s3, -1
+; GCN-IR-NEXT:    s_sub_u32 s6, 58, s6
+; GCN-IR-NEXT:    s_subb_u32 s7, 0, 0
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT:    s_mov_b32 s5, 0
 ; GCN-IR-NEXT:  BB6_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    s_lshr_b32 s4, s11, 31
-; GCN-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
-; GCN-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
-; GCN-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[4:5]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[12:13], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s4, s6, s14
-; GCN-IR-NEXT:    s_subb_u32 s4, s7, s15
-; GCN-IR-NEXT:    s_ashr_i32 s12, s4, 31
-; GCN-IR-NEXT:    s_mov_b32 s13, s12
-; GCN-IR-NEXT:    s_and_b32 s4, s12, 1
-; GCN-IR-NEXT:    s_and_b64 s[16:17], s[12:13], s[2:3]
-; GCN-IR-NEXT:    s_sub_u32 s14, s14, s16
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    s_subb_u32 s15, s15, s17
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
-; GCN-IR-NEXT:    s_add_u32 s8, s8, 1
-; GCN-IR-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 31
+; GCN-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s4, s14, s12
+; GCN-IR-NEXT:    s_subb_u32 s4, s15, s13
+; GCN-IR-NEXT:    s_ashr_i32 s10, s4, 31
+; GCN-IR-NEXT:    s_mov_b32 s11, s10
+; GCN-IR-NEXT:    s_and_b32 s4, s10, 1
+; GCN-IR-NEXT:    s_and_b64 s[16:17], s[10:11], s[2:3]
+; GCN-IR-NEXT:    s_sub_u32 s12, s12, s16
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    s_subb_u32 s13, s13, s17
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
+; GCN-IR-NEXT:    s_add_u32 s6, s6, 1
+; GCN-IR-NEXT:    s_addc_u32 s7, s7, 0
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; GCN-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB6_3
 ; GCN-IR-NEXT:  BB6_4: ; %Flow5
-; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[10:11], 1
+; GCN-IR-NEXT:    s_lshl_b64 s[6:7], s[8:9], 1
 ; GCN-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
@@ -1268,39 +1268,39 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz BB8_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, -1, v0
+; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
-; GCN-IR-NEXT:    v_lshr_b64 v[10:11], s[4:5], v8
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, -1, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, -1, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v4, vcc, 47, v4
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, 0
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT:    v_lshr_b64 v[8:9], s[4:5], v8
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v5, vcc, 0, v5, vcc
 ; GCN-IR-NEXT:  BB8_3: ; %udiv-do-while
 ; GCN-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-IR-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
+; GCN-IR-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v3
-; GCN-IR-NEXT:    v_or_b32_e32 v10, v10, v6
+; GCN-IR-NEXT:    v_or_b32_e32 v8, v8, v6
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v8, v10
-; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v9, v11, vcc
-; GCN-IR-NEXT:    v_or_b32_e32 v2, v12, v2
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
-; GCN-IR-NEXT:    v_and_b32_e32 v15, v12, v0
-; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v12
-; GCN-IR-NEXT:    v_and_b32_e32 v14, v12, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v12, vcc, 1, v4
-; GCN-IR-NEXT:    v_or_b32_e32 v3, v13, v3
-; GCN-IR-NEXT:    v_addc_u32_e32 v13, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5]
-; GCN-IR-NEXT:    v_mov_b32_e32 v4, v12
+; GCN-IR-NEXT:    v_sub_i32_e32 v6, vcc, v12, v8
+; GCN-IR-NEXT:    v_subb_u32_e32 v6, vcc, v13, v9, vcc
+; GCN-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GCN-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
+; GCN-IR-NEXT:    v_and_b32_e32 v15, v10, v0
+; GCN-IR-NEXT:    v_and_b32_e32 v6, 1, v10
+; GCN-IR-NEXT:    v_and_b32_e32 v14, v10, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; GCN-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GCN-IR-NEXT:    v_addc_u32_e32 v11, vcc, 0, v5, vcc
+; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5]
+; GCN-IR-NEXT:    v_mov_b32_e32 v4, v10
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-IR-NEXT:    v_sub_i32_e64 v10, s[4:5], v10, v15
-; GCN-IR-NEXT:    v_mov_b32_e32 v5, v13
-; GCN-IR-NEXT:    v_mov_b32_e32 v13, v7
-; GCN-IR-NEXT:    v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
+; GCN-IR-NEXT:    v_sub_i32_e64 v8, s[4:5], v8, v15
+; GCN-IR-NEXT:    v_mov_b32_e32 v5, v11
+; GCN-IR-NEXT:    v_mov_b32_e32 v11, v7
+; GCN-IR-NEXT:    v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5]
 ; GCN-IR-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; GCN-IR-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT:    v_mov_b32_e32 v10, v6
 ; GCN-IR-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT:    s_cbranch_execnz BB8_3
 ; GCN-IR-NEXT:  ; %bb.4: ; %Flow