[llvm] [AMDGPU][NFC] Test autogenerated llc tests for COV5 (PR #74339)

Saiyedul Islam via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 4 08:51:17 PST 2023


https://github.com/saiislam created https://github.com/llvm/llvm-project/pull/74339

Regenerate a few llc tests to test for COV5 instead of the default ABI version.

>From 5e8861902b3e0d37e0461e2681b9e151195325da Mon Sep 17 00:00:00 2001
From: Saiyedul Islam <Saiyedul.Islam at amd.com>
Date: Mon, 4 Dec 2023 10:48:21 -0600
Subject: [PATCH] [AMDGPU][NFC] Test autogenerated llc tests for COV5

Regenerate a few llc tests to test for COV5 instead of the
default ABI version.
---
 .../abi-attribute-hints-undefined-behavior.ll |  112 +-
 .../test/CodeGen/AMDGPU/addrspacecast.gfx6.ll |   69 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll    |   63 +-
 .../annotate-kernel-features-hsa-call.ll      |   31 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   22 +-
 ...der-no-live-segment-at-def-implicit-def.ll |   70 +-
 .../branch-folding-implicit-def-subreg.ll     |  779 ++---
 .../CodeGen/AMDGPU/call-argument-types.ll     | 2683 ++++++++---------
 llvm/test/CodeGen/AMDGPU/call-waitcnt.ll      |   92 +-
 .../callee-special-input-sgprs-fixed-abi.ll   |   23 +-
 llvm/test/CodeGen/AMDGPU/cc-update.ll         |  307 +-
 .../CodeGen/AMDGPU/cf-loop-on-constant.ll     |   93 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll    |   23 +-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |   63 +-
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |   43 +-
 llvm/test/CodeGen/AMDGPU/ds_read2.ll          |   49 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll  |   15 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll |   39 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 1158 ++++---
 .../AMDGPU/global_atomics_scan_fmax.ll        |  696 ++---
 .../AMDGPU/global_atomics_scan_fmin.ll        |  696 ++---
 .../AMDGPU/global_atomics_scan_fsub.ll        | 1157 ++++---
 22 files changed, 3991 insertions(+), 4292 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index c7bc584d42d5a..121656b6a30f2 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -276,16 +276,16 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
 ; FIXEDABI-SDAG-LABEL: addrspacecast_requires_queue_ptr:
 ; FIXEDABI-SDAG:       ; %bb.0:
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x40
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; FIXEDABI-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; FIXEDABI-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
-; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v2, s5
-; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
 ; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
-; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v0, s4
-; FIXEDABI-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
-; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v5, 0, v0, vcc
 ; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v0, 1
+; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v4, s4
+; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
+; FIXEDABI-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
+; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
 ; FIXEDABI-SDAG-NEXT:    flat_store_dword v[2:3], v0
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -297,12 +297,15 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
 ; FIXEDABI-GISEL-LABEL: addrspacecast_requires_queue_ptr:
 ; FIXEDABI-GISEL:       ; %bb.0:
 ; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x40
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[4:5], 0xc0
+; FIXEDABI-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[4:5], 0xc4
+; FIXEDABI-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; FIXEDABI-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
 ; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
 ; FIXEDABI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v0, s5
-; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v0, vcc
+; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v3, s6
+; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
 ; FIXEDABI-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v1
 ; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
@@ -322,16 +325,29 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
 }
 
 define void @is_shared_requires_queue_ptr(ptr %ptr) #0 {
-; FIXEDABI-LABEL: is_shared_requires_queue_ptr:
-; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-NEXT:    s_load_dword s4, s[6:7], 0x40
-; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
-; FIXEDABI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
-; FIXEDABI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; FIXEDABI-NEXT:    flat_store_dword v[0:1], v0
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
+; FIXEDABI-SDAG-LABEL: is_shared_requires_queue_ptr:
+; FIXEDABI-SDAG:       ; %bb.0:
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; FIXEDABI-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x0
+; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
+; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; FIXEDABI-SDAG-NEXT:    flat_store_dword v[0:1], v0
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; FIXEDABI-GISEL-LABEL: is_shared_requires_queue_ptr:
+; FIXEDABI-GISEL:       ; %bb.0:
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[4:5], 0xc4
+; FIXEDABI-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; FIXEDABI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
+; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; FIXEDABI-GISEL-NEXT:    flat_store_dword v[0:1], v0
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
   %zext = zext i1 %is.shared to i32
   store volatile i32 %zext, ptr addrspace(1) undef
@@ -339,16 +355,29 @@ define void @is_shared_requires_queue_ptr(ptr %ptr) #0 {
 }
 
 define void @is_private_requires_queue_ptr(ptr %ptr) #0 {
-; FIXEDABI-LABEL: is_private_requires_queue_ptr:
-; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-NEXT:    s_load_dword s4, s[6:7], 0x44
-; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
-; FIXEDABI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
-; FIXEDABI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; FIXEDABI-NEXT:    flat_store_dword v[0:1], v0
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
+; FIXEDABI-SDAG-LABEL: is_private_requires_queue_ptr:
+; FIXEDABI-SDAG:       ; %bb.0:
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; FIXEDABI-SDAG-NEXT:    s_load_dword s4, s[4:5], 0x0
+; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
+; FIXEDABI-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; FIXEDABI-SDAG-NEXT:    flat_store_dword v[0:1], v0
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; FIXEDABI-GISEL-LABEL: is_private_requires_queue_ptr:
+; FIXEDABI-GISEL:       ; %bb.0:
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[4:5], 0xc0
+; FIXEDABI-GISEL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; FIXEDABI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
+; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; FIXEDABI-GISEL-NEXT:    flat_store_dword v[0:1], v0
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
   %zext = zext i1 %is.private to i32
   store volatile i32 %zext, ptr addrspace(1) undef
@@ -356,11 +385,21 @@ define void @is_private_requires_queue_ptr(ptr %ptr) #0 {
 }
 
 define void @trap_requires_queue() #0 {
-; FIXEDABI-LABEL: trap_requires_queue:
-; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FIXEDABI-NEXT:    s_mov_b64 s[0:1], s[6:7]
-; FIXEDABI-NEXT:    s_trap 2
+; FIXEDABI-SDAG-LABEL: trap_requires_queue:
+; FIXEDABI-SDAG:       ; %bb.0:
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; FIXEDABI-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_trap 2
+;
+; FIXEDABI-GISEL-LABEL: trap_requires_queue:
+; FIXEDABI-GISEL:       ; %bb.0:
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_mov_b64 s[4:5], 0xc8
+; FIXEDABI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; FIXEDABI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_trap 2
   call void @llvm.trap()
   unreachable
 }
@@ -390,3 +429,6 @@ declare void @llvm.trap()
 declare void @llvm.debugtrap()
 
 attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll
index 2d60faaf8c8c0..e962df69b45c1 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll
@@ -21,16 +21,29 @@ define ptr addrspace(1) @flat_to_gobal_addrspacecast(ptr %ptr) {
 }
 
 define ptr @group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
-; CHECK-LABEL: group_to_flat_addrspacecast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s4, s[6:7], 0x10
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: group_to_flat_addrspacecast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; SDAG-NEXT:    s_load_dword s4, s[4:5], 0x0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: group_to_flat_addrspacecast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0xc4
+; GISEL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   ret ptr %stof
 }
@@ -47,16 +60,29 @@ define ptr addrspace(3) @flat_to_group_addrspacecast(ptr %ptr) {
 }
 
 define ptr @private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
-; CHECK-LABEL: private_to_flat_addrspacecast:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_load_dword s4, s[6:7], 0x11
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v1, s4
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: private_to_flat_addrspacecast:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; SDAG-NEXT:    s_load_dword s4, s[4:5], 0x0
+; SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: private_to_flat_addrspacecast:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0xc0
+; GISEL-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   ret ptr %stof
 }
@@ -204,3 +230,6 @@ define ptr addrspace(6) @addrspacecast_flat_null_to_constant32bit() {
 }
 
 attributes #0 = { "amdgpu-32bit-address-high-bits"="0xffff8000" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 8c64ab5952def..e53fa9650f9ec 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -146,13 +146,13 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
 ; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -163,9 +163,9 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s12, s45
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
@@ -285,13 +285,13 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v43, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
 ; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -302,9 +302,9 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s12, s45
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
@@ -430,13 +430,13 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
 ; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v42, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -447,9 +447,9 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s12, s45
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
@@ -571,13 +571,13 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
 ; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v42, 1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -588,9 +588,9 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s12, s45
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
@@ -715,13 +715,13 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_writelane_b32 v40, s45, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
 ; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    v_or_b32_e32 v43, 1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -732,9 +732,9 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s12, s45
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
@@ -773,3 +773,6 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 1f90c0d03a856..1396dab69c13a 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -458,7 +458,7 @@ define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR8]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -477,7 +477,7 @@ define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -496,7 +496,7 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %pt
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_indirect_use_queue_ptr()
@@ -515,7 +515,7 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR14:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_implicitarg_ptr()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -1010,6 +1010,9 @@ attributes #4 = { nounwind sanitize_address }
 attributes #5 = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" }
 attributes #6 = { "enqueued-block" }
 
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
 ;.
 ; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ; AKF_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" }
@@ -1029,14 +1032,14 @@ attributes #6 = { "enqueued-block" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
@@ -1052,3 +1055,7 @@ attributes #6 = { "enqueued-block" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
 ;.
+; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 500}
+;.
+; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 500}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 9e5df73d69a07..e0139fda653f8 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -424,7 +424,7 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR11]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -442,7 +442,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR11]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -534,7 +534,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR11]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -555,7 +555,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR11]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -621,7 +621,7 @@ define void @use_alloca_func() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR12:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR13:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -634,6 +634,9 @@ define void @use_alloca_func() #1 {
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
+
 ;.
 ; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
@@ -650,6 +653,11 @@ attributes #1 = { nounwind }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+;.
+; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 500}
+;.
+; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index a965a20244b4e..7c8d40c49bb80 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -4,15 +4,16 @@
 define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %extractVec358.i.i, i32 %cmp5.i.i.arg, float %i1.i, i32 %cmp221.i.i.arg, i32 %cmp262.i.i.arg, ptr addrspace(1) %arg) {
 ; CHECK-LABEL: blender_no_live_segment_at_def_error:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_add_u32 s12, s12, s17
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s13, s13, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT:    s_load_dwordx8 s[36:43], s[8:9], 0x0
-; CHECK-NEXT:    s_add_u32 s0, s0, s17
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_load_dwordx8 s[36:43], s[6:7], 0x0
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b32 s12, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s40, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_8
@@ -21,54 +22,50 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_4
 ; CHECK-NEXT:  ; %bb.2: ; %if.else251.i.i
 ; CHECK-NEXT:    s_cmp_lg_u32 s43, 0
-; CHECK-NEXT:    s_mov_b32 s17, 0
-; CHECK-NEXT:    s_cselect_b32 s12, -1, 0
-; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT:    s_mov_b32 s15, 0
+; CHECK-NEXT:    s_cselect_b32 s8, -1, 0
+; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_5
 ; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    s_mov_b32 s36, 0
-; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_6
 ; CHECK-NEXT:    s_branch .LBB0_7
 ; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    s_mov_b32 s14, s12
-; CHECK-NEXT:    s_mov_b32 s15, s12
-; CHECK-NEXT:    s_mov_b32 s13, s12
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[14:15]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT:    s_mov_b32 s10, s8
+; CHECK-NEXT:    s_mov_b32 s11, s8
+; CHECK-NEXT:    s_mov_b32 s9, s8
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
 ; CHECK-NEXT:    s_branch .LBB0_7
 ; CHECK-NEXT:  .LBB0_5: ; %if.then263.i.i
-; CHECK-NEXT:    v_cmp_lt_f32_e64 s12, s41, 0
+; CHECK-NEXT:    v_cmp_lt_f32_e64 s8, s41, 0
 ; CHECK-NEXT:    s_mov_b32 s36, 1.0
-; CHECK-NEXT:    s_mov_b32 s17, 0x7fc00000
+; CHECK-NEXT:    s_mov_b32 s15, 0x7fc00000
 ; CHECK-NEXT:    s_mov_b32 s37, s36
 ; CHECK-NEXT:    s_mov_b32 s38, s36
 ; CHECK-NEXT:    s_mov_b32 s39, s36
-; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s8
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_7
 ; CHECK-NEXT:  .LBB0_6: ; %if.end273.i.i
-; CHECK-NEXT:    s_add_u32 s12, s8, 40
-; CHECK-NEXT:    s_addc_u32 s13, s9, 0
-; CHECK-NEXT:    s_getpc_b64 s[18:19]
-; CHECK-NEXT:    s_add_u32 s18, s18, _Z3dotDv3_fS_ at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s19, s19, _Z3dotDv3_fS_ at gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s8, s6, 40
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, _Z3dotDv3_fS_ at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3dotDv3_fS_ at gotpcrel32@hi+12
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[18:19], 0x0
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 10, v1
-; CHECK-NEXT:    v_add_f32_e64 v1, s17, s36
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[12:13]
-; CHECK-NEXT:    s_mov_b32 s12, s14
+; CHECK-NEXT:    v_add_f32_e64 v1, s15, s36
+; CHECK-NEXT:    s_mov_b32 s36, 0
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    v_or3_b32 v31, v0, v3, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    s_mov_b32 s13, s15
-; CHECK-NEXT:    s_mov_b32 s14, s16
-; CHECK-NEXT:    s_mov_b32 s36, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[34:35]
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
 ; CHECK-NEXT:    s_mov_b32 s37, s36
 ; CHECK-NEXT:    s_mov_b32 s38, s36
 ; CHECK-NEXT:    s_mov_b32 s39, s36
@@ -79,7 +76,7 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; CHECK-NEXT:  .LBB0_8: ; %kernel_direct_lighting.exit
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x20
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x20
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s36
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s37
@@ -125,3 +122,6 @@ kernel_direct_lighting.exit:                      ; preds = %if.end294.i.i, %ent
 }
 
 declare float @_Z3dotDv3_fS_(<3 x float>)
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index afd43de2eaafd..5a128c7541d1e 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -5,95 +5,96 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-LABEL: name: f1
   ; GFX90A: bb.0.bb:
   ; GFX90A-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr17, $sgpr12_sgpr13
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr32 = S_MOV_B32 0
-  ; GFX90A-NEXT:   $flat_scr_lo = S_ADD_U32 $sgpr12, $sgpr17, implicit-def $scc
-  ; GFX90A-NEXT:   $flat_scr_hi = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc
-  ; GFX90A-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr17, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   $flat_scr_lo = S_ADD_U32 $sgpr10, $sgpr15, implicit-def $scc
+  ; GFX90A-NEXT:   $flat_scr_hi = S_ADDC_U32 $sgpr11, 0, implicit-def dead $scc, implicit $scc
+  ; GFX90A-NEXT:   $sgpr0 = S_ADD_U32 $sgpr0, $sgpr15, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
   ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
   ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr24_sgpr25 = S_XOR_B64 renamable $sgpr8_sgpr9, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr33, 8, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr26_sgpr27 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr26_sgpr27 = S_XOR_B64 killed renamable $sgpr26_sgpr27, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.1.bb103:
-  ; GFX90A-NEXT:   successors: %bb.58(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.2(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc
   ; GFX90A-NEXT:   $vgpr22 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $vgpr10 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $vgpr24 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $vgpr18 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $vgpr20 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.58, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.59, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr23 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr23 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr25 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3.Flow17:
-  ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.57(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.58(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.57, implicit $vcc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.58, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.4.bb15:
   ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr4 = COPY renamable $sgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr16, $vgpr0, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.35, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.5:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -112,123 +113,120 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.6.Flow20:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr18 = COPY $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr21 = COPY $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr20 = COPY $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr23 = COPY $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr22 = COPY $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr25 = COPY $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr24 = COPY $sgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr19 = COPY renamable $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr18 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr21 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr20 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr23 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr22 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr25 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr24 = COPY $sgpr15, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.7.Flow19:
-  ; GFX90A-NEXT:   successors: %bb.62(0x40000000), %bb.8(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   successors: %bb.63(0x40000000), %bb.8(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
-  ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.62, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
+  ; GFX90A-NEXT:   $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $sgpr28_sgpr29, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.63, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.8.Flow32:
   ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc
+  ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.10, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.9.bb89:
   ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.10.Flow33:
   ; GFX90A-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr58_sgpr59, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
+  ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.11.bb84:
   ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.12.Flow34:
   ; GFX90A-NEXT:   successors: %bb.13(0x40000000), %bb.14(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
+  ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.14, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.13.bb79:
   ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.14.Flow35:
   ; GFX90A-NEXT:   successors: %bb.15(0x40000000), %bb.16(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
+  ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr16_sgpr17, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.16, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.15.bb72:
   ; GFX90A-NEXT:   successors: %bb.16(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr8 = S_ADD_U32 renamable $sgpr8, 48, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr9, 0, implicit-def dead $scc, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2, target-flags(amdgpu-gotprel32-hi) @f2, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_LOAD_DWORDX2_IMM killed renamable $sgpr12_sgpr13, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
-  ; GFX90A-NEXT:   $sgpr12 = COPY killed renamable $sgpr14
-  ; GFX90A-NEXT:   $sgpr13 = COPY killed renamable $sgpr15
-  ; GFX90A-NEXT:   $sgpr14 = COPY killed renamable $sgpr16
-  ; GFX90A-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $sgpr8 = S_ADD_U32 renamable $sgpr6, 48, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr9 = S_ADDC_U32 killed renamable $sgpr7, 0, implicit-def dead $scc, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @f2, target-flags(amdgpu-gotprel32-hi) @f2, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM killed renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+  ; GFX90A-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.16.Flow36:
   ; GFX90A-NEXT:   successors: %bb.17(0x40000000), %bb.18(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.18, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.17.bb67:
   ; GFX90A-NEXT:   successors: %bb.18(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.18.Flow37:
   ; GFX90A-NEXT:   successors: %bb.19(0x40000000), %bb.20(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -237,15 +235,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.19.bb62:
   ; GFX90A-NEXT:   successors: %bb.20(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.20.Flow38:
   ; GFX90A-NEXT:   successors: %bb.21(0x40000000), %bb.22(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -254,15 +252,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.21.bb54:
   ; GFX90A-NEXT:   successors: %bb.22(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.22.Flow39:
   ; GFX90A-NEXT:   successors: %bb.23(0x40000000), %bb.24(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -271,58 +269,58 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.23.bb47:
   ; GFX90A-NEXT:   successors: %bb.24(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.24.Flow40:
   ; GFX90A-NEXT:   successors: %bb.25(0x40000000), %bb.26(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.26, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.25.bb40:
   ; GFX90A-NEXT:   successors: %bb.26(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.26.Flow41:
   ; GFX90A-NEXT:   successors: %bb.27(0x40000000), %bb.28(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.28, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.27.bb33:
   ; GFX90A-NEXT:   successors: %bb.28(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.28.Flow42:
   ; GFX90A-NEXT:   successors: %bb.34(0x40000000), %bb.29(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.34, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.29.Flow43:
   ; GFX90A-NEXT:   successors: %bb.30(0x40000000), %bb.31(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
@@ -330,17 +328,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.30.bb19:
   ; GFX90A-NEXT:   successors: %bb.31(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.31.Flow44:
   ; GFX90A-NEXT:   successors: %bb.32(0x40000000), %bb.33(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr56_sgpr57, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr54_sgpr55, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.33, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock:
@@ -356,32 +354,32 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.34.bb26:
   ; GFX90A-NEXT:   successors: %bb.29(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr56_sgpr57, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.29
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.35.bb20:
   ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -399,27 +397,29 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.37, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.36.Flow21:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.6
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.37.bb27:
   ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -436,39 +436,39 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.39, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.38.Flow22:
   ; GFX90A-NEXT:   successors: %bb.36(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_ANDN2_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.36
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.39.bb34:
   ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.40(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29
   ; GFX90A-NEXT:   renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
@@ -486,44 +486,43 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.41, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.40.Flow23:
   ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr40_sgpr41, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.38
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
-  ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   successors: %bb.47(0x40000000), %bb.42(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
-  ; GFX90A-NEXT:   renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = COPY $vcc
+  ; GFX90A-NEXT:   renamable $vgpr59, dead renamable $sgpr16_sgpr17 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1)
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = COPY renamable $sgpr28_sgpr29
   ; GFX90A-NEXT:   renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -538,49 +537,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.46, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.47, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.42.Flow24:
   ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr56_sgpr57, killed renamable $sgpr60_sgpr61, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr16_sgpr17, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.40
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.43.bb55:
-  ; GFX90A-NEXT:   successors: %bb.48(0x40000000), %bb.44(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT:   successors: %bb.49(0x40000000), %bb.44(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc
-  ; GFX90A-NEXT:   $vgpr10 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $vgpr12 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.48, implicit $vcc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr46_sgpr47, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.49, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.44:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr18, $vgpr30, $vgpr31, $vgpr61, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr10, $vgpr12
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -593,41 +591,47 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.45.Flow26:
-  ; GFX90A-NEXT:   successors: %bb.47(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.47
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.46.bb48:
-  ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53
+  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.46.Flow26:
+  ; GFX90A-NEXT:   successors: %bb.48(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_BRANCH %bb.48
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.47.bb48:
+  ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.48(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = COPY $vcc
   ; GFX90A-NEXT:   renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1)
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr28_sgpr29
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr16_sgpr17, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -641,57 +645,71 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.43, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.47.Flow25:
+  ; GFX90A-NEXT: bb.48.Flow25:
   ; GFX90A-NEXT:   successors: %bb.42(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr16_sgpr17, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.42
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.48.bb63:
-  ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55, $sgpr46_sgpr47
+  ; GFX90A-NEXT: bb.49.bb63:
+  ; GFX90A-NEXT:   successors: %bb.51(0x40000000), %bb.50(0x40000000)
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.51, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.49:
-  ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT: bb.50:
+  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   S_BRANCH %bb.44
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29
+  ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.50.bb68:
-  ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT: bb.51.bb68:
+  ; GFX90A-NEXT:   successors: %bb.55(0x40000000), %bb.52(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.54, implicit $vcc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.55, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.51:
-  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr54_sgpr55
+  ; GFX90A-NEXT: bb.52:
+  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -703,26 +721,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_BRANCH %bb.45
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   S_BRANCH %bb.46
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.52.bb80:
-  ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.53.bb80:
+  ; GFX90A-NEXT:   successors: %bb.60(0x40000000), %bb.54(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr15 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr15, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.59, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr48_sgpr49 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.60, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.53:
-  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.54:
+  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
@@ -732,21 +750,21 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_BRANCH %bb.61
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   S_BRANCH %bb.62
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.54.bb73:
-  ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53
+  ; GFX90A-NEXT: bb.55.bb73:
+  ; GFX90A-NEXT:   successors: %bb.53(0x40000000), %bb.56(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
   ; GFX90A-NEXT:   renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $vgpr5, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29
+  ; GFX90A-NEXT:   renamable $vgpr5, dead renamable $sgpr56_sgpr57 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
@@ -757,53 +775,53 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $sgpr17 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.52, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.53, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.55.Flow29:
-  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.56.Flow29:
+  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.45
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc
+  ; GFX90A-NEXT:   S_BRANCH %bb.46
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.56.bb90:
-  ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.57.bb90:
+  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr56, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr57, killed $vgpr10, 1, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr54, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr55, killed $vgpr10, 1, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr8_sgpr9, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.60
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_BRANCH %bb.61
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.57:
+  ; GFX90A-NEXT: bb.58:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $exec:0x000000000000000F, $sgpr12, $sgpr13, $sgpr14, $sgpr15:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = IMPLICIT_DEF
@@ -825,9 +843,9 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.7
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.58.bb105:
+  ; GFX90A-NEXT: bb.59.bb105:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
@@ -835,26 +853,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr23 = S_MOV_B32 0
-  ; GFX90A-NEXT:   renamable $sgpr17 = S_MOV_B32 0
+  ; GFX90A-NEXT:   renamable $sgpr15 = S_MOV_B32 0
   ; GFX90A-NEXT:   S_BRANCH %bb.3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.59.bb85:
-  ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.60(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.60.bb85:
+  ; GFX90A-NEXT:   successors: %bb.57(0x40000000), %bb.61(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
-  ; GFX90A-NEXT:   renamable $sgpr17 = S_MOV_B32 0
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr15 = S_MOV_B32 0
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = COPY renamable $sgpr28_sgpr29
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
@@ -863,70 +881,70 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.56, implicit $exec
+  ; GFX90A-NEXT:   $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.57, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.60.Flow31:
-  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.61.Flow31:
+  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.61.Flow30:
-  ; GFX90A-NEXT:   successors: %bb.55(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.62.Flow30:
+  ; GFX90A-NEXT:   successors: %bb.56(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.55
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_BRANCH %bb.56
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.62.bb140:
-  ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.63(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.63.bb140:
+  ; GFX90A-NEXT:   successors: %bb.69(0x40000000), %bb.64(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.68, implicit $vcc
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr26_sgpr27, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.69, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.63.Flow13:
-  ; GFX90A-NEXT:   successors: %bb.64(0x40000000), %bb.66(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.64.Flow13:
+  ; GFX90A-NEXT:   successors: %bb.65(0x40000000), %bb.67(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.66, implicit $vcc
+  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.67, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.64.bb159:
-  ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.65(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.65.bb159:
+  ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.66(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.67, implicit $exec
+  ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.68, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.65.Flow10:
-  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.66.Flow10:
+  ; GFX90A-NEXT:   successors: %bb.67(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
+  ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_ANDN2_SAVEEXEC_B64 $sgpr8_sgpr9, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.66.Flow14:
+  ; GFX90A-NEXT: bb.67.Flow14:
   ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = COPY $exec
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = COPY $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.8
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.67.bb161:
-  ; GFX90A-NEXT:   successors: %bb.65(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.68.bb161:
+  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
@@ -941,38 +959,38 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
-  ; GFX90A-NEXT:   S_BRANCH %bb.65
+  ; GFX90A-NEXT:   S_BRANCH %bb.66
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.68.bb174:
-  ; GFX90A-NEXT:   successors: %bb.72(0x40000000), %bb.69(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.69.bb174:
+  ; GFX90A-NEXT:   successors: %bb.73(0x40000000), %bb.70(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr8_sgpr9, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr10, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr12, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
+  ; GFX90A-NEXT:   renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr8_sgpr9, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.73, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.69.Flow:
-  ; GFX90A-NEXT:   successors: %bb.70(0x40000000), %bb.71(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.70.Flow:
+  ; GFX90A-NEXT:   successors: %bb.71(0x40000000), %bb.72(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.71, implicit $vcc
+  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.70.bb186:
-  ; GFX90A-NEXT:   successors: %bb.71(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.71.bb186:
+  ; GFX90A-NEXT:   successors: %bb.72(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr19, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr18, $vgpr2, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec
@@ -995,23 +1013,23 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.71.Flow9:
-  ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.72.Flow9:
+  ; GFX90A-NEXT:   successors: %bb.64(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.63
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 0
+  ; GFX90A-NEXT:   S_BRANCH %bb.64
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.72.bb196:
-  ; GFX90A-NEXT:   successors: %bb.69(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.73.bb196:
+  ; GFX90A-NEXT:   successors: %bb.70(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.69
+  ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_MOV_B64 0
+  ; GFX90A-NEXT:   S_BRANCH %bb.70
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i11 = icmp eq i32 %i, 0
@@ -1312,3 +1330,6 @@ bb196:
 declare void @f2(i64)
 
 declare i32 @llvm.amdgcn.workitem.id.x()
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 4e2b83af7f5e2..a192a1b8dff93 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -67,9 +67,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1
@@ -86,9 +85,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -105,9 +103,8 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -121,29 +118,27 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_i1_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_i1(i1 true)
   ret void
@@ -152,17 +147,16 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; VI-LABEL: test_call_external_void_func_i1_signext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -175,17 +169,16 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_i1_signext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -198,17 +191,16 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -223,35 +215,33 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_signext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_signext at rel32@hi+12
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_signext:
 ; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext at rel32@hi+12
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext at rel32@hi+12
 ; HSA-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_signext(i1 signext %var)
@@ -262,17 +252,16 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; VI-LABEL: test_call_external_void_func_i1_zeroext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -285,17 +274,16 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_i1_zeroext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -308,17 +296,16 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -333,35 +320,33 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i1_zeroext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_zeroext at rel32@hi+12
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i1_zeroext:
 ; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1_zeroext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_zeroext at rel32@hi+12
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext at rel32@hi+12
 ; HSA-NEXT:    v_and_b32_e32 v0, 1, v0
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_zeroext(i1 zeroext %var)
@@ -375,9 +360,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -394,9 +378,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -413,9 +396,8 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -429,29 +411,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-LABEL: test_call_external_void_func_i8_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i8 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i8 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i8_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i8 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i8 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_i8(i8 123)
   ret void
@@ -461,17 +441,16 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; VI-LABEL: test_call_external_void_func_i8_signext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -483,17 +462,16 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_i8_signext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -505,17 +483,16 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_i8_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -529,34 +506,32 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_i8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i8_signext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_signext at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i8_signext:
 ; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8_signext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8_signext at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i8, ptr addrspace(1) undef
   call void @external_void_func_i8_signext(i8 signext %var)
@@ -566,17 +541,16 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; VI-LABEL: test_call_external_void_func_i8_zeroext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -588,17 +562,16 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_i8_zeroext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -610,17 +583,16 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -634,34 +606,32 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i8_zeroext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i8_zeroext at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i8_zeroext:
 ; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8_zeroext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8_zeroext at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i8, ptr addrspace(1) undef
   call void @external_void_func_i8_zeroext(i8 zeroext %var)
@@ -675,9 +645,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -694,9 +663,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -713,9 +681,8 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -729,29 +696,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_i16_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i16_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_i16(i16 123)
   ret void
@@ -760,17 +725,16 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ; VI-LABEL: test_call_external_void_func_i16_signext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -782,17 +746,16 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_i16_signext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -804,17 +767,16 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_i16_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -828,34 +790,32 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_i16 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i16_signext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_signext at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i16_signext:
 ; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_sshort v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16_signext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16_signext at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i16, ptr addrspace(1) undef
   call void @external_void_func_i16_signext(i16 signext %var)
@@ -865,17 +825,16 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; VI-LABEL: test_call_external_void_func_i16_zeroext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -887,17 +846,16 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_i16_zeroext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
-; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -909,17 +867,16 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -933,34 +890,32 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i16_zeroext at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i16_zeroext at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i16_zeroext:
 ; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 glc
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16_zeroext at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16_zeroext at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %var = load volatile i16, ptr addrspace(1) undef
   call void @external_void_func_i16_zeroext(i16 zeroext %var)
@@ -974,9 +929,8 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
@@ -993,9 +947,8 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
@@ -1012,9 +965,8 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
@@ -1028,29 +980,27 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-LABEL: test_call_external_void_func_i32_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 42
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_i32(i32 42)
   ret void
@@ -1063,9 +1013,8 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -1083,9 +1032,8 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -1103,9 +1051,8 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
@@ -1120,30 +1067,28 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_i64_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_i64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_i64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_i64_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_i64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_i64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_i64(i64 123)
   ret void
@@ -1156,9 +1101,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b32 s0, 0
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s1, s0
@@ -1179,9 +1123,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b32 s0, 0
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s1, s0
@@ -1202,9 +1145,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_mov_b32 s1, s0
@@ -1221,38 +1163,36 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_mov_b32 s5, s4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2i64:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b32 s8, 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 s9, s8
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b32 s4, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    s_mov_b32 s5, s4
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <2 x i64>, ptr addrspace(1) null
   call void @external_void_func_v2i64(<2 x i64> %val)
@@ -1266,9 +1206,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1
@@ -1288,9 +1227,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -1310,9 +1248,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -1330,32 +1267,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2i64_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
 ; HSA-NEXT:    v_mov_b32_e32 v3, 4
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
   ret void
@@ -1368,9 +1303,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b32 s0, 0
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s1, s0
@@ -1393,9 +1327,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b32 s0, 0
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s1, s0
@@ -1418,9 +1351,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_mov_b32 s1, s0
@@ -1439,41 +1371,39 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_mov_b32 s5, s4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3i64:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b32 s8, 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 s9, s8
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b32 s4, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    s_mov_b32 s5, s4
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    v_mov_b32_e32 v4, 1
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v5, 2
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %load = load <2 x i64>, ptr addrspace(1) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
@@ -1489,9 +1419,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b32 s0, 0
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s1, s0
@@ -1516,9 +1445,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b32 s0, 0
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s1, s0
@@ -1543,9 +1471,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s0, 0
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_mov_b32 s1, s0
@@ -1566,44 +1493,42 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_mov_b32 s5, s4
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v4i64:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_mov_b32 s8, 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 s9, s8
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_mov_b32 s4, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    s_mov_b32 s5, s4
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    v_mov_b32_e32 v4, 1
 ; HSA-NEXT:    v_mov_b32_e32 v5, 2
 ; HSA-NEXT:    v_mov_b32_e32 v6, 3
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v7, 4
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %load = load <2 x i64>, ptr addrspace(1) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1618,9 +1543,8 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
@@ -1637,9 +1561,8 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 4.0
@@ -1656,9 +1579,8 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
@@ -1672,29 +1594,27 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_f16_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4400
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_f16_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x4400
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_f16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_f16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_f16(half 4.0)
   ret void
@@ -1707,9 +1627,8 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 4.0
@@ -1726,9 +1645,8 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 4.0
@@ -1745,9 +1663,8 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
@@ -1761,29 +1678,27 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_f32_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 4.0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_f32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 4.0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_f32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_f32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_f32(float 4.0)
   ret void
@@ -1796,9 +1711,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -1816,9 +1730,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -1836,9 +1749,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -1853,30 +1765,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2f32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
   ret void
@@ -1889,9 +1799,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -1910,9 +1819,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -1931,9 +1839,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -1950,31 +1857,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 4.0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3f32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
   ret void
@@ -1987,9 +1892,8 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -2010,9 +1914,8 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -2033,9 +1936,8 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -2055,33 +1957,31 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0.5
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5f32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v5f32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v5f32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v5f32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
 ; HSA-NEXT:    v_mov_b32_e32 v3, -1.0
 ; HSA-NEXT:    v_mov_b32_e32 v4, 0.5
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v5f32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v5f32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
   ret void
@@ -2094,9 +1994,8 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
@@ -2114,9 +2013,8 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -2134,9 +2032,8 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2151,30 +2048,28 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_f64_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f64 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_f64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_f64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_f64_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x40100000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_f64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_f64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_f64(double 4.0)
   ret void
@@ -2187,9 +2082,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
@@ -2209,9 +2103,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -2231,9 +2124,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2251,32 +2143,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f64 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2f64_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 0
 ; HSA-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
   ret void
@@ -2289,9 +2179,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
@@ -2313,9 +2202,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
@@ -2337,9 +2225,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -2360,23 +2247,20 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f64 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f64 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f64 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3f64_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
 ; HSA-NEXT:    v_mov_b32_e32 v2, 0
@@ -2384,10 +2268,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v4, 0
 ; HSA-NEXT:    v_mov_b32_e32 v5, 0x40200000
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f64 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f64 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
   ret void
@@ -2396,16 +2281,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
 define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 ; VI-LABEL: test_call_external_void_func_v2i16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -2417,16 +2301,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_v2i16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -2440,16 +2323,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_v2i16:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -2463,32 +2345,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i16 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2i16:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <2 x i16>, ptr addrspace(1) undef
   call void @external_void_func_v2i16(<2 x i16> %val)
@@ -2498,16 +2378,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
 define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ; VI-LABEL: test_call_external_void_func_v3i16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -2519,16 +2398,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_v3i16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -2544,16 +2422,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_v3i16:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -2567,32 +2444,30 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3i16:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <3 x i16>, ptr addrspace(1) undef
   call void @external_void_func_v3i16(<3 x i16> %val)
@@ -2602,16 +2477,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
 ; VI-LABEL: test_call_external_void_func_v3f16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -2623,16 +2497,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_v3f16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -2642,23 +2515,22 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_v3f16:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -2672,32 +2544,30 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3f16:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <3 x half>, ptr addrspace(1) undef
   call void @external_void_func_v3f16(<3 x half> %val)
@@ -2711,9 +2581,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x20001
@@ -2731,9 +2600,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -2752,9 +2620,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
@@ -2769,30 +2636,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3i16_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; HSA-NEXT:    v_mov_b32_e32 v1, 3
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
   ret void
@@ -2805,9 +2670,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x40003c00
@@ -2825,9 +2689,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1.0
@@ -2846,9 +2709,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
@@ -2864,30 +2726,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x4400
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3f16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3f16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3f16_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x40003c00
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x4400
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
   ret void
@@ -2896,16 +2756,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
 define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
 ; VI-LABEL: test_call_external_void_func_v4i16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -2917,16 +2776,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_v4i16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -2943,16 +2801,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_v4i16:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -2966,32 +2823,30 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v4i16:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <4 x i16>, ptr addrspace(1) undef
   call void @external_void_func_v4i16(<4 x i16> %val)
@@ -3005,9 +2860,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x20001
@@ -3025,9 +2879,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -3047,9 +2900,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
@@ -3065,30 +2917,28 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40003
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v4i16_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; HSA-NEXT:    v_mov_b32_e32 v1, 0x40003
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
   ret void
@@ -3097,16 +2947,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
 define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 ; VI-LABEL: test_call_external_void_func_v2f16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -3118,16 +2967,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_v2f16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -3143,16 +2991,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_v2f16:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -3166,32 +3013,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f16 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2f16 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2f16 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2f16:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f16 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f16 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) undef
   call void @external_void_func_v2f16(<2 x half> %val)
@@ -3201,16 +3046,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
 ; VI-LABEL: test_call_external_void_func_v2i32:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -3222,16 +3066,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_v2i32:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -3243,16 +3086,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_v2i32:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -3266,32 +3108,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <2 x i32>, ptr addrspace(1) undef
   call void @external_void_func_v2i32(<2 x i32> %val)
@@ -3305,9 +3145,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1
@@ -3325,9 +3164,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -3345,9 +3183,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -3362,30 +3199,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX11-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v2i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v2i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v2i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
   ret void
@@ -3398,9 +3233,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 3
@@ -3419,9 +3253,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 3
@@ -3440,9 +3273,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
@@ -3459,31 +3291,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 5
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    v_mov_b32_e32 v1, 4
 ; HSA-NEXT:    v_mov_b32_e32 v2, 5
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
   ret void
@@ -3496,9 +3326,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 3
@@ -3518,9 +3347,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 3
@@ -3540,9 +3368,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
@@ -3560,32 +3387,30 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32_i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v3i32_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v3i32_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v3i32_i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    v_mov_b32_e32 v1, 4
 ; HSA-NEXT:    v_mov_b32_e32 v2, 5
 ; HSA-NEXT:    v_mov_b32_e32 v3, 6
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
   ret void
@@ -3594,16 +3419,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ; VI-LABEL: test_call_external_void_func_v4i32:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -3615,16 +3439,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ;
 ; CI-LABEL: test_call_external_void_func_v4i32:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
-; CI-NEXT:    s_mov_b32 s3, 0xf000
-; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -3636,16 +3459,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ;
 ; GFX9-LABEL: test_call_external_void_func_v4i32:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
-; GFX9-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -3659,32 +3481,30 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v4i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = load <4 x i32>, ptr addrspace(1) undef
   call void @external_void_func_v4i32(<4 x i32> %val)
@@ -3698,9 +3518,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1
@@ -3720,9 +3539,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -3742,9 +3560,8 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -3762,32 +3579,30 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v4i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v4i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v4i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
 ; HSA-NEXT:    v_mov_b32_e32 v3, 4
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
   ret void
@@ -3800,9 +3615,8 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1
@@ -3823,9 +3637,8 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -3846,9 +3659,8 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -3868,33 +3680,31 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 5
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v5i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v5i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v5i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
 ; HSA-NEXT:    v_mov_b32_e32 v3, 4
 ; HSA-NEXT:    v_mov_b32_e32 v4, 5
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v5i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v5i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
   ret void
@@ -3904,18 +3714,17 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ; VI-LABEL: test_call_external_void_func_v8i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -3928,18 +3737,17 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ; CI-LABEL: test_call_external_void_func_v8i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -3952,18 +3760,17 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -3975,40 +3782,39 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v8i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v8i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v8i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v8i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i32>, ptr addrspace(1) %ptr
@@ -4023,9 +3829,8 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    v_mov_b32_e32 v0, 1
@@ -4049,9 +3854,8 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    v_mov_b32_e32 v0, 1
@@ -4075,9 +3879,8 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
@@ -4101,23 +3904,20 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
 ; GFX11-NEXT:    v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v8i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v8i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v8i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 1
 ; HSA-NEXT:    v_mov_b32_e32 v1, 2
 ; HSA-NEXT:    v_mov_b32_e32 v2, 3
@@ -4127,10 +3927,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v6, 7
 ; HSA-NEXT:    v_mov_b32_e32 v7, 8
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v8i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v8i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
   ret void
@@ -4140,20 +3941,19 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; VI-LABEL: test_call_external_void_func_v16i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -4166,20 +3966,19 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; CI-LABEL: test_call_external_void_func_v16i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -4192,20 +3991,19 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
 ; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -4217,44 +4015,43 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i32 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
-; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v16i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v16i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v16i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i32>, ptr addrspace(1) %ptr
@@ -4265,152 +4062,147 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
 ; VI-LABEL: test_call_external_void_func_v32i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b32 s32, 0
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT:    s_getpc_b64 s[8:9]
-; VI-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32 at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32 at rel32@hi+12
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
 ; VI-NEXT:    s_waitcnt vmcnt(7)
 ; VI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
-; VI-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: test_call_external_void_func_v32i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT:    s_getpc_b64 s[8:9]
-; CI-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32 at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32 at rel32@hi+12
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
 ; CI-NEXT:    s_waitcnt vmcnt(7)
 ; CI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
-; CI-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: test_call_external_void_func_v32i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_getpc_b64 s[8:9]
-; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32 at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32 at rel32@hi+12
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32 at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32 at rel32@hi+12
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
 ; GFX9-NEXT:    buffer_store_dword v31, off, s[36:39], s32
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x7
-; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
-; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64
-; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80
-; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
 ; GFX11-NEXT:    scratch_store_b32 off, v31, s32
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v32i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
-; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
-; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT:    s_getpc_b64 s[12:13]
-; HSA-NEXT:    s_add_u32 s12, s12, external_void_func_v32i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s13, s13, external_void_func_v32i32 at rel32@hi+12
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32 at rel32@hi+12
 ; HSA-NEXT:    s_waitcnt vmcnt(7)
 ; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <32 x i32>, ptr addrspace(1) %ptr
@@ -4422,25 +4214,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; VI-LABEL: test_call_external_void_func_v32i32_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    buffer_load_dword v32, off, s[4:7], 0
-; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], 0
+; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b32 s32, 0
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
@@ -4457,25 +4248,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; CI-LABEL: test_call_external_void_func_v32i32_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    buffer_load_dword v32, off, s[4:7], 0
-; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    buffer_load_dword v32, off, s[0:3], 0
+; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
@@ -4492,25 +4282,24 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_load_dword v32, off, s[4:7], 0
-; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
@@ -4526,63 +4315,61 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x8
-; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
-; GFX11-NEXT:    buffer_load_b32 v32, off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
-; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64
-; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80
-; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_add_i32 s4, s32, 4
+; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112
+; GFX11-NEXT:    buffer_load_b32 v32, off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32 at rel32@hi+12
+; GFX11-NEXT:    s_add_i32 s2, s32, 4
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b32 off, v31, s32
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    scratch_store_b32 off, v32, s4
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    scratch_store_b32 off, v32, s2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v32i32_i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; HSA-NEXT:    s_add_u32 s0, s0, s11
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NEXT:    buffer_load_dword v32, off, s[8:11], 0
-; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
-; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
-; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
+; HSA-NEXT:    buffer_load_dword v32, off, s[4:7], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32_i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32_i32 at rel32@hi+12
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32 at rel32@hi+12
 ; HSA-NEXT:    s_waitcnt vmcnt(8)
 ; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
 ; HSA-NEXT:    s_waitcnt vmcnt(8)
 ; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
   %val0 = load <32 x i32>, ptr addrspace(1) %ptr0
@@ -4598,10 +4385,9 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ; VI-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s42, -1
 ; VI-NEXT:    s_mov_b32 s43, 0xe80000
-; VI-NEXT:    s_add_u32 s40, s40, s5
-; VI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
+; VI-NEXT:    s_add_u32 s40, s40, s3
+; VI-NEXT:    s_load_dwordx2 s[36:37], s[0:1], 0x24
 ; VI-NEXT:    s_addc_u32 s41, s41, 0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
@@ -4622,10 +4408,9 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ; CI-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s42, -1
 ; CI-NEXT:    s_mov_b32 s43, 0xe8f000
-; CI-NEXT:    s_add_u32 s40, s40, s5
-; CI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
+; CI-NEXT:    s_add_u32 s40, s40, s3
+; CI-NEXT:    s_load_dwordx2 s[36:37], s[0:1], 0x9
 ; CI-NEXT:    s_addc_u32 s41, s41, 0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
@@ -4646,10 +4431,9 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s42, -1
 ; GFX9-NEXT:    s_mov_b32 s43, 0xe00000
-; GFX9-NEXT:    s_add_u32 s40, s40, s5
-; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
+; GFX9-NEXT:    s_add_u32 s40, s40, s3
+; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[0:1], 0x24
 ; GFX9-NEXT:    s_addc_u32 s41, s41, 0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
@@ -4666,17 +4450,16 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ;
 ; GFX11-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[36:37], s[2:3], 0x24
+; GFX11-NEXT:    s_load_b64 s[36:37], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 42
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_mov_b32 s39, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s38, -1
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_i32_func_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_i32_func_i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_i32_func_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_i32_func_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[36:39], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_nop 0
@@ -4685,21 +4468,20 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
 ;
 ; HSA-LABEL: test_call_external_i32_func_i32_imm:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_load_dwordx2 s[36:37], s[6:7], 0x0
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, 42
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; HSA-NEXT:    s_mov_b32 s39, 0x1100f000
 ; HSA-NEXT:    s_mov_b32 s38, -1
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_i32_func_i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_i32_func_i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_i32_func_i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    buffer_store_dword v0, off, s[36:39], 0
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-NEXT:    s_endpgm
@@ -4712,18 +4494,17 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; VI-LABEL: test_call_external_void_func_struct_i8_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
-; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_mov_b32 s32, 0
@@ -4736,18 +4517,17 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; CI-LABEL: test_call_external_void_func_struct_i8_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
-; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
@@ -4760,18 +4540,17 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
@@ -4783,40 +4562,39 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_load_u8 v0, off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0 offset:4
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0 offset:4
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_struct_i8_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_struct_i8_i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
-; HSA-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:4
+; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; HSA-NEXT:    buffer_load_dword v1, off, s[4:7], 0 offset:4
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_struct_i8_i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_struct_i8_i32 at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32 at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load { i8, i32 }, ptr addrspace(1) %ptr0
@@ -4831,7 +4609,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, 3
 ; VI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
@@ -4839,7 +4617,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; VI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
 ; VI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
 ; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_movk_i32 s32, 0x400
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
@@ -4859,7 +4636,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    v_mov_b32_e32 v0, 3
 ; CI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
@@ -4867,7 +4644,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
 ; CI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
 ; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_movk_i32 s32, 0x400
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
@@ -4887,7 +4663,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
@@ -4896,7 +4672,6 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_movk_i32 s32, 0x400
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
@@ -4914,24 +4689,23 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
 ; GFX11-NEXT:    s_mov_b32 s32, 16
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b8 off, v0, off offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:12
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s7
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
@@ -4940,16 +4714,15 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
 ; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8
 ; HSA-NEXT:    s_movk_i32 s32, 0x400
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32 at rel32@hi+12
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32 at rel32@hi+12
 ; HSA-NEXT:    s_waitcnt vmcnt(1)
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; HSA-NEXT:    s_waitcnt vmcnt(1)
 ; HSA-NEXT:    buffer_store_dword v1, off, s[0:3], s32
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %val = alloca { i8, i32 }, align 8, addrspace(5)
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0
@@ -4967,7 +4740,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_add_u32 s36, s36, s3
 ; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, 3
 ; VI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
@@ -4976,7 +4749,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; VI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
 ; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
 ; VI-NEXT:    s_movk_i32 s32, 0x800
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; VI-NEXT:    s_getpc_b64 s[4:5]
@@ -5005,7 +4777,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_add_u32 s36, s36, s3
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    v_mov_b32_e32 v0, 3
 ; CI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
@@ -5014,7 +4786,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; CI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
 ; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
 ; CI-NEXT:    s_movk_i32 s32, 0x800
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_getpc_b64 s[4:5]
@@ -5043,7 +4814,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
@@ -5053,7 +4824,6 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
 ; GFX9-NEXT:    s_movk_i32 s32, 0x800
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
@@ -5080,10 +4850,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
 ; GFX11-NEXT:    s_mov_b32 s32, 32
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b8 off, v0, off offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:12
@@ -5091,7 +4860,7 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 16
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_u8 v0, off, off offset:16
 ; GFX11-NEXT:    scratch_load_b32 v1, off, off offset:20
@@ -5109,9 +4878,9 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ;
 ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    v_mov_b32_e32 v0, 3
 ; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
@@ -5120,17 +4889,16 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
 ; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8
 ; HSA-NEXT:    s_movk_i32 s32, 0x800
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 at rel32@hi+12
 ; HSA-NEXT:    s_waitcnt vmcnt(1)
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; HSA-NEXT:    s_waitcnt vmcnt(1)
 ; HSA-NEXT:    buffer_store_dword v1, off, s[0:3], s32
 ; HSA-NEXT:    v_mov_b32_e32 v0, 16
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:16
 ; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:20
 ; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
@@ -5162,20 +4930,19 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; VI-LABEL: test_call_external_void_func_v16i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s38, -1
 ; VI-NEXT:    s_mov_b32 s39, 0xe80000
-; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_add_u32 s36, s36, s1
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b32 s32, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; VI-NEXT:    s_addc_u32 s37, s37, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; VI-NEXT:    s_mov_b32 s32, 0
 ; VI-NEXT:    s_getpc_b64 s[4:5]
 ; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
 ; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
@@ -5204,20 +4971,19 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; CI-LABEL: test_call_external_void_func_v16i8:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s38, -1
 ; CI-NEXT:    s_mov_b32 s39, 0xe8f000
-; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_add_u32 s36, s36, s1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_getpc_b64 s[4:5]
 ; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
 ; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
@@ -5246,20 +5012,19 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_add_u32 s36, s36, s1
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT:    s_addc_u32 s37, s37, 0
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
@@ -5287,16 +5052,15 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v16i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v16i8 at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i8 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -5314,26 +5078,25 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v17
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v16i8:
 ; HSA:       ; %bb.0:
-; HSA-NEXT:    s_add_i32 s6, s6, s9
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; HSA-NEXT:    s_add_u32 s0, s0, s9
-; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
-; HSA-NEXT:    s_mov_b32 s10, -1
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_i32 s4, s4, s7
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
+; HSA-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; HSA-NEXT:    s_add_u32 s0, s0, s7
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
-; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; HSA-NEXT:    s_mov_b32 s32, 0
-; HSA-NEXT:    s_getpc_b64 s[8:9]
-; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v16i8 at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v16i8 at rel32@hi+12
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8 at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8 at rel32@hi+12
 ; HSA-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
 ; HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
@@ -5353,7 +5116,7 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; HSA-NEXT:    v_mov_b32_e32 v1, v16
 ; HSA-NEXT:    v_mov_b32_e32 v2, v17
 ; HSA-NEXT:    v_mov_b32_e32 v3, v18
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i8>, ptr addrspace(1) %ptr
@@ -5368,19 +5131,18 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; VI-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
 ; VI-NEXT:    s_mov_b32 s54, -1
 ; VI-NEXT:    s_mov_b32 s55, 0xe80000
-; VI-NEXT:    s_add_u32 s52, s52, s5
-; VI-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x64
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa4
-; VI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x24
+; VI-NEXT:    s_add_u32 s52, s52, s3
+; VI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xa4
+; VI-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x24
 ; VI-NEXT:    s_mov_b32 s32, 0
 ; VI-NEXT:    s_addc_u32 s53, s53, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s23
+; VI-NEXT:    v_mov_b32_e32 v0, s19
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    buffer_store_dword v1, off, s[52:55], s32 offset:4
+; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    s_mov_b64 s[0:1], s[52:53]
 ; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
 ; VI-NEXT:    s_mov_b64 s[2:3], s[54:55]
@@ -5400,25 +5162,25 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; VI-NEXT:    v_mov_b32_e32 v13, s49
 ; VI-NEXT:    v_mov_b32_e32 v14, s50
 ; VI-NEXT:    v_mov_b32_e32 v15, s51
-; VI-NEXT:    v_mov_b32_e32 v16, s8
-; VI-NEXT:    v_mov_b32_e32 v17, s9
-; VI-NEXT:    v_mov_b32_e32 v18, s10
-; VI-NEXT:    v_mov_b32_e32 v19, s11
-; VI-NEXT:    v_mov_b32_e32 v20, s12
-; VI-NEXT:    v_mov_b32_e32 v21, s13
-; VI-NEXT:    v_mov_b32_e32 v22, s14
-; VI-NEXT:    v_mov_b32_e32 v23, s15
-; VI-NEXT:    v_mov_b32_e32 v24, s16
-; VI-NEXT:    v_mov_b32_e32 v25, s17
-; VI-NEXT:    v_mov_b32_e32 v26, s18
-; VI-NEXT:    v_mov_b32_e32 v27, s19
-; VI-NEXT:    v_mov_b32_e32 v28, s20
-; VI-NEXT:    v_mov_b32_e32 v29, s21
-; VI-NEXT:    v_mov_b32_e32 v30, s22
-; VI-NEXT:    s_getpc_b64 s[4:5]
-; VI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; VI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
-; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    v_mov_b32_e32 v16, s4
+; VI-NEXT:    v_mov_b32_e32 v17, s5
+; VI-NEXT:    v_mov_b32_e32 v18, s6
+; VI-NEXT:    v_mov_b32_e32 v19, s7
+; VI-NEXT:    v_mov_b32_e32 v20, s8
+; VI-NEXT:    v_mov_b32_e32 v21, s9
+; VI-NEXT:    v_mov_b32_e32 v22, s10
+; VI-NEXT:    v_mov_b32_e32 v23, s11
+; VI-NEXT:    v_mov_b32_e32 v24, s12
+; VI-NEXT:    v_mov_b32_e32 v25, s13
+; VI-NEXT:    v_mov_b32_e32 v26, s14
+; VI-NEXT:    v_mov_b32_e32 v27, s15
+; VI-NEXT:    v_mov_b32_e32 v28, s16
+; VI-NEXT:    v_mov_b32_e32 v29, s17
+; VI-NEXT:    v_mov_b32_e32 v30, s18
+; VI-NEXT:    s_getpc_b64 s[20:21]
+; VI-NEXT:    s_add_u32 s20, s20, stack_passed_f64_arg at rel32@lo+4
+; VI-NEXT:    s_addc_u32 s21, s21, stack_passed_f64_arg at rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; VI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: stack_passed_arg_alignment_v32i32_f64:
@@ -5427,19 +5189,18 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; CI-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
 ; CI-NEXT:    s_mov_b32 s54, -1
 ; CI-NEXT:    s_mov_b32 s55, 0xe8f000
-; CI-NEXT:    s_add_u32 s52, s52, s5
-; CI-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x19
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x29
-; CI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x9
+; CI-NEXT:    s_add_u32 s52, s52, s3
+; CI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x29
+; CI-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_addc_u32 s53, s53, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s23
+; CI-NEXT:    v_mov_b32_e32 v0, s19
+; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
-; CI-NEXT:    v_mov_b32_e32 v0, s5
-; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    buffer_store_dword v1, off, s[52:55], s32 offset:4
+; CI-NEXT:    v_mov_b32_e32 v0, s3
 ; CI-NEXT:    s_mov_b64 s[0:1], s[52:53]
 ; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
 ; CI-NEXT:    s_mov_b64 s[2:3], s[54:55]
@@ -5459,25 +5220,25 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; CI-NEXT:    v_mov_b32_e32 v13, s49
 ; CI-NEXT:    v_mov_b32_e32 v14, s50
 ; CI-NEXT:    v_mov_b32_e32 v15, s51
-; CI-NEXT:    v_mov_b32_e32 v16, s8
-; CI-NEXT:    v_mov_b32_e32 v17, s9
-; CI-NEXT:    v_mov_b32_e32 v18, s10
-; CI-NEXT:    v_mov_b32_e32 v19, s11
-; CI-NEXT:    v_mov_b32_e32 v20, s12
-; CI-NEXT:    v_mov_b32_e32 v21, s13
-; CI-NEXT:    v_mov_b32_e32 v22, s14
-; CI-NEXT:    v_mov_b32_e32 v23, s15
-; CI-NEXT:    v_mov_b32_e32 v24, s16
-; CI-NEXT:    v_mov_b32_e32 v25, s17
-; CI-NEXT:    v_mov_b32_e32 v26, s18
-; CI-NEXT:    v_mov_b32_e32 v27, s19
-; CI-NEXT:    v_mov_b32_e32 v28, s20
-; CI-NEXT:    v_mov_b32_e32 v29, s21
-; CI-NEXT:    v_mov_b32_e32 v30, s22
-; CI-NEXT:    s_getpc_b64 s[4:5]
-; CI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; CI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
-; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    v_mov_b32_e32 v16, s4
+; CI-NEXT:    v_mov_b32_e32 v17, s5
+; CI-NEXT:    v_mov_b32_e32 v18, s6
+; CI-NEXT:    v_mov_b32_e32 v19, s7
+; CI-NEXT:    v_mov_b32_e32 v20, s8
+; CI-NEXT:    v_mov_b32_e32 v21, s9
+; CI-NEXT:    v_mov_b32_e32 v22, s10
+; CI-NEXT:    v_mov_b32_e32 v23, s11
+; CI-NEXT:    v_mov_b32_e32 v24, s12
+; CI-NEXT:    v_mov_b32_e32 v25, s13
+; CI-NEXT:    v_mov_b32_e32 v26, s14
+; CI-NEXT:    v_mov_b32_e32 v27, s15
+; CI-NEXT:    v_mov_b32_e32 v28, s16
+; CI-NEXT:    v_mov_b32_e32 v29, s17
+; CI-NEXT:    v_mov_b32_e32 v30, s18
+; CI-NEXT:    s_getpc_b64 s[20:21]
+; CI-NEXT:    s_add_u32 s20, s20, stack_passed_f64_arg at rel32@lo+4
+; CI-NEXT:    s_addc_u32 s21, s21, stack_passed_f64_arg at rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
@@ -5486,19 +5247,18 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; GFX9-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s54, -1
 ; GFX9-NEXT:    s_mov_b32 s55, 0xe00000
-; GFX9-NEXT:    s_add_u32 s52, s52, s5
-; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x64
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa4
-; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x24
+; GFX9-NEXT:    s_add_u32 s52, s52, s3
+; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xa4
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_addc_u32 s53, s53, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s23
+; GFX9-NEXT:    v_mov_b32_e32 v0, s19
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    buffer_store_dword v1, off, s[52:55], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[52:53]
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[54:55]
@@ -5518,43 +5278,43 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; GFX9-NEXT:    v_mov_b32_e32 v13, s49
 ; GFX9-NEXT:    v_mov_b32_e32 v14, s50
 ; GFX9-NEXT:    v_mov_b32_e32 v15, s51
-; GFX9-NEXT:    v_mov_b32_e32 v16, s8
-; GFX9-NEXT:    v_mov_b32_e32 v17, s9
-; GFX9-NEXT:    v_mov_b32_e32 v18, s10
-; GFX9-NEXT:    v_mov_b32_e32 v19, s11
-; GFX9-NEXT:    v_mov_b32_e32 v20, s12
-; GFX9-NEXT:    v_mov_b32_e32 v21, s13
-; GFX9-NEXT:    v_mov_b32_e32 v22, s14
-; GFX9-NEXT:    v_mov_b32_e32 v23, s15
-; GFX9-NEXT:    v_mov_b32_e32 v24, s16
-; GFX9-NEXT:    v_mov_b32_e32 v25, s17
-; GFX9-NEXT:    v_mov_b32_e32 v26, s18
-; GFX9-NEXT:    v_mov_b32_e32 v27, s19
-; GFX9-NEXT:    v_mov_b32_e32 v28, s20
-; GFX9-NEXT:    v_mov_b32_e32 v29, s21
-; GFX9-NEXT:    v_mov_b32_e32 v30, s22
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v16, s4
+; GFX9-NEXT:    v_mov_b32_e32 v17, s5
+; GFX9-NEXT:    v_mov_b32_e32 v18, s6
+; GFX9-NEXT:    v_mov_b32_e32 v19, s7
+; GFX9-NEXT:    v_mov_b32_e32 v20, s8
+; GFX9-NEXT:    v_mov_b32_e32 v21, s9
+; GFX9-NEXT:    v_mov_b32_e32 v22, s10
+; GFX9-NEXT:    v_mov_b32_e32 v23, s11
+; GFX9-NEXT:    v_mov_b32_e32 v24, s12
+; GFX9-NEXT:    v_mov_b32_e32 v25, s13
+; GFX9-NEXT:    v_mov_b32_e32 v26, s14
+; GFX9-NEXT:    v_mov_b32_e32 v27, s15
+; GFX9-NEXT:    v_mov_b32_e32 v28, s16
+; GFX9-NEXT:    v_mov_b32_e32 v29, s17
+; GFX9-NEXT:    v_mov_b32_e32 v30, s18
+; GFX9-NEXT:    s_getpc_b64 s[20:21]
+; GFX9-NEXT:    s_add_u32 s20, s20, stack_passed_f64_arg at rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s21, s21, stack_passed_f64_arg at rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[20:21], s[2:3], 0xa4
-; GFX11-NEXT:    s_load_b512 s[4:19], s[2:3], 0x64
-; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x24
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0xa4
+; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x64
+; GFX11-NEXT:    s_load_b512 s[36:51], s[0:1], 0x24
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s22, s32, 8
+; GFX11-NEXT:    s_add_i32 s20, s32, 8
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20
+; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s19
-; GFX11-NEXT:    s_add_i32 s19, s32, 4
+; GFX11-NEXT:    s_add_i32 s2, s32, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43
-; GFX11-NEXT:    scratch_store_b32 off, v0, s22
-; GFX11-NEXT:    scratch_store_b32 off, v1, s19
+; GFX11-NEXT:    scratch_store_b32 off, v0, s20
+; GFX11-NEXT:    scratch_store_b32 off, v1, s2
 ; GFX11-NEXT:    scratch_store_b32 off, v2, s32
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38
@@ -5571,33 +5331,31 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; GFX11-NEXT:    v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14
 ; GFX11-NEXT:    v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16
 ; GFX11-NEXT:    v_mov_b32_e32 v30, s18
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+12
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg at rel32@hi+12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; HSA:       ; %bb.0: ; %entry
-; HSA-NEXT:    s_add_i32 s8, s8, s11
-; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
-; HSA-NEXT:    s_add_u32 s0, s0, s11
-; HSA-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x40
-; HSA-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x80
-; HSA-NEXT:    s_load_dwordx16 s[36:51], s[6:7], 0x0
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
+; HSA-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x80
+; HSA-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; HSA-NEXT:    s_mov_b32 s32, 0
 ; HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-NEXT:    v_mov_b32_e32 v0, s23
+; HSA-NEXT:    v_mov_b32_e32 v1, s6
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
-; HSA-NEXT:    v_mov_b32_e32 v0, s24
-; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
-; HSA-NEXT:    v_mov_b32_e32 v0, s25
+; HSA-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; HSA-NEXT:    v_mov_b32_e32 v0, s7
 ; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
-; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; HSA-NEXT:    v_mov_b32_e32 v0, s36
 ; HSA-NEXT:    v_mov_b32_e32 v1, s37
 ; HSA-NEXT:    v_mov_b32_e32 v2, s38
@@ -5629,10 +5387,10 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; HSA-NEXT:    v_mov_b32_e32 v28, s20
 ; HSA-NEXT:    v_mov_b32_e32 v29, s21
 ; HSA-NEXT:    v_mov_b32_e32 v30, s22
-; HSA-NEXT:    s_getpc_b64 s[24:25]
-; HSA-NEXT:    s_add_u32 s24, s24, stack_passed_f64_arg at rel32@lo+4
-; HSA-NEXT:    s_addc_u32 s25, s25, stack_passed_f64_arg at rel32@hi+12
-; HSA-NEXT:    s_swappc_b64 s[30:31], s[24:25]
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg at rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg at rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT:    s_endpgm
 entry:
   call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
@@ -7258,3 +7016,6 @@ declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x f
 attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind noinline }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index c0ba343d1dd35..616e5f00fc1e5 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -5,20 +5,19 @@
 define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
 ; GCN-LABEL: call_memory_arg_load:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s11
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    ds_read_b32 v0, v0
-; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:    s_add_u32 s8, s8, func at rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s9, s9, func at rel32@hi+12
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, func at rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, func at rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    s_endpgm
   %vgpr = load volatile i32, ptr addrspace(3) %ptr
   call void @func(i32 %vgpr)
@@ -29,21 +28,20 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
 define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_memory_no_dep:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s11
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_store_dword v0, v0, s[6:7]
-; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT:    global_store_dword v0, v0, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:    s_add_u32 s8, s8, func at rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s9, s9, func at rel32@hi+12
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func at rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func at rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
   store i32 0, ptr addrspace(1) %ptr
   call void @func(i32 0)
@@ -54,19 +52,18 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT:    s_load_dwordx2 s[34:35], s[6:7], 0x0
-; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s11
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:    s_add_u32 s8, s8, func at rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s9, s9, func at rel32@hi+12
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, func at rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, func at rel32@hi+12
 ; GCN-NEXT:    v_mov_b32_e32 v40, 0
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    global_store_dword v40, v40, s[34:35]
 ; GCN-NEXT:    s_endpgm
   call void @func(i32 0)
@@ -77,19 +74,18 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #
 define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_no_wait_after_call_return_val:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT:    s_load_dwordx2 s[34:35], s[6:7], 0x0
-; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s11
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:    s_add_u32 s8, s8, func.return at rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s9, s9, func.return at rel32@hi+12
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, func.return at rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, func.return at rel32@hi+12
 ; GCN-NEXT:    v_mov_b32_e32 v40, 0
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    global_store_dword v40, v0, s[34:35]
 ; GCN-NEXT:    s_endpgm
   %rv = call i32 @func.return(i32 0)
@@ -101,19 +97,18 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %
 define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: call_got_load:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s11
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_add_u32 s0, s0, s9
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_getpc_b64 s[6:7]
-; GCN-NEXT:    s_add_u32 s6, s6, got.func at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s7, s7, got.func at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
-; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, got.func at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, got.func at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    s_endpgm
   call void @got.func(i32 0)
   ret void
@@ -155,3 +150,6 @@ declare hidden i32 @func.return(i32) #0
 declare void @got.func(i32) #0
 
 attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 9eed9b5bb26c8..e1828042752ea 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -28,15 +28,15 @@ define hidden void @use_queue_ptr() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr:
-; GCN: s_mov_b64 s[6:7], s[4:5]
-; GCN: .amdhsa_user_sgpr_queue_ptr 1
+; GCN: s_swappc_b64 s[30:31], s[4:5]
+; GCN: .amdhsa_user_sgpr_queue_ptr 0
 define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
   call void @use_queue_ptr()
   ret void
 }
 
 ; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
-; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10
+; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[4:5], 0x0
 ; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16
 ; CIVI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
 
@@ -52,8 +52,8 @@ define hidden void @use_queue_ptr_addrspacecast() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast:
-; CIVI: s_mov_b64 s[6:7], s[4:5]
-; CIVI: .amdhsa_user_sgpr_queue_ptr 1
+; CIVI: s_swappc_b64 s[30:31], s[4:5]
+; CIVI: .amdhsa_user_sgpr_queue_ptr 0
 
 ; GFX9-NOT: s_mov_b64 s[6:7]
 ; GFX9: .amdhsa_user_sgpr_queue_ptr 0
@@ -463,15 +463,12 @@ define hidden void @use_every_sgpr_input() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
-; GCN: s_mov_b32 s13, s15
-; GCN: s_mov_b32 s12, s14
-; GCN: s_mov_b32 s14, s16
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
 ; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
 ; GCN: .amdhsa_user_sgpr_dispatch_ptr 1
-; GCN: .amdhsa_user_sgpr_queue_ptr 1
+; GCN: .amdhsa_user_sgpr_queue_ptr 0
 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; GCN: .amdhsa_user_sgpr_dispatch_id 1
 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
@@ -490,14 +487,13 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
 ; We have to pass the kernarg segment, but there are no kernel
 ; arguments so null is passed.
 ; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input_no_kernargs:
-; GCN: s_mov_b64 s[10:11], s[8:9]
-; GCN: s_mov_b64 s[8:9], 0{{$}}
+; GCN: s_mov_b64 s[10:11], s[6:7]
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
 ; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
 ; GCN: .amdhsa_user_sgpr_dispatch_ptr 1
-; GCN: .amdhsa_user_sgpr_queue_ptr 1
+; GCN: .amdhsa_user_sgpr_queue_ptr 0
 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0
 ; GCN: .amdhsa_user_sgpr_dispatch_id 1
 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
@@ -584,3 +580,6 @@ declare noalias ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind noinline }
 attributes #2 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll
index 8a69069d75e2c..ca09163b20afc 100644
--- a/llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -67,80 +67,76 @@ entry:
 define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_call:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_add_i32 s12, s12, s17
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s17
+; GFX803-NEXT:    s_add_u32 s0, s0, s15
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_addc_u32 s1, s1, 0
-; GFX803-NEXT:    s_mov_b32 s13, s15
-; GFX803-NEXT:    s_mov_b32 s12, s14
+; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT:    s_mov_b32 s14, s16
+; GFX803-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT:    s_mov_b32 s32, 0
-; GFX803-NEXT:    s_getpc_b64 s[18:19]
-; GFX803-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX803-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT:    s_getpc_b64 s[16:17]
+; GFX803-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX803-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: test_kern_call:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s17
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT:    s_mov_b32 s32, 0
-; GFX900-NEXT:    s_getpc_b64 s[18:19]
-; GFX900-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX900-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT:    s_getpc_b64 s[16:17]
+; GFX900-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: test_kern_call:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_add_u32 s12, s12, s17
+; GFX1010-NEXT:    s_add_u32 s10, s10, s15
 ; GFX1010-NEXT:    s_mov_b32 s32, 0
-; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1010-NEXT:    s_addc_u32 s11, s11, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT:    s_add_u32 s0, s0, s17
+; GFX1010-NEXT:    s_add_u32 s0, s0, s15
 ; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
-; GFX1010-NEXT:    s_mov_b32 s13, s15
-; GFX1010-NEXT:    s_mov_b32 s12, s14
+; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT:    s_mov_b32 s14, s16
-; GFX1010-NEXT:    s_getpc_b64 s[18:19]
-; GFX1010-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX1010-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX1010-NEXT:    s_getpc_b64 s[16:17]
+; GFX1010-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: test_kern_call:
 ; GFX1100:       ; %bb.0: ; %entry
 ; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1100-NEXT:    s_mov_b32 s12, s13
-; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1100-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX1100-NEXT:    s_mov_b32 s13, s14
 ; GFX1100-NEXT:    s_mov_b32 s14, s15
 ; GFX1100-NEXT:    s_mov_b32 s32, 0
-; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
-; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1100-NEXT:    s_getpc_b64 s[6:7]
+; GFX1100-NEXT:    s_add_u32 s6, s6, ex at rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s7, s7, ex at rel32@hi+12
+; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1100-NEXT:    s_endpgm
 
 entry:
@@ -151,72 +147,69 @@ entry:
 define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_stack_and_call:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_add_i32 s12, s12, s17
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s17
+; GFX803-NEXT:    s_add_u32 s0, s0, s15
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_addc_u32 s1, s1, 0
-; GFX803-NEXT:    s_mov_b32 s13, s15
-; GFX803-NEXT:    s_mov_b32 s12, s14
+; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT:    s_mov_b32 s14, s16
+; GFX803-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT:    s_movk_i32 s32, 0x400
 ; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    s_getpc_b64 s[18:19]
-; GFX803-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX803-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT:    s_getpc_b64 s[16:17]
+; GFX803-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX803-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: test_kern_stack_and_call:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s17
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT:    s_movk_i32 s32, 0x400
 ; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    s_getpc_b64 s[18:19]
-; GFX900-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX900-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT:    s_getpc_b64 s[16:17]
+; GFX900-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: test_kern_stack_and_call:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_add_u32 s12, s12, s17
+; GFX1010-NEXT:    s_add_u32 s10, s10, s15
 ; GFX1010-NEXT:    s_movk_i32 s32, 0x200
-; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1010-NEXT:    s_addc_u32 s11, s11, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1010-NEXT:    s_add_u32 s0, s0, s17
+; GFX1010-NEXT:    s_add_u32 s0, s0, s15
 ; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
-; GFX1010-NEXT:    s_mov_b32 s13, s15
+; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT:    s_mov_b32 s12, s14
-; GFX1010-NEXT:    s_mov_b32 s14, s16
+; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
 ; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-NEXT:    s_getpc_b64 s[18:19]
-; GFX1010-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX1010-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX1010-NEXT:    s_getpc_b64 s[16:17]
+; GFX1010-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: test_kern_stack_and_call:
@@ -224,19 +217,18 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1100-NEXT:    s_mov_b32 s12, s13
-; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1100-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX1100-NEXT:    s_mov_b32 s13, s14
 ; GFX1100-NEXT:    s_mov_b32 s14, s15
 ; GFX1100-NEXT:    s_mov_b32 s32, 16
 ; GFX1100-NEXT:    scratch_store_b32 off, v1, off offset:4 dlc
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
-; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1100-NEXT:    s_getpc_b64 s[6:7]
+; GFX1100-NEXT:    s_add_u32 s6, s6, ex at rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s7, s7, ex at rel32@hi+12
+; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1100-NEXT:    s_endpgm
 
 entry:
@@ -318,84 +310,80 @@ entry:
 define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX803-LABEL: test_force_fp_kern_call:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_add_i32 s12, s12, s17
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s17
+; GFX803-NEXT:    s_add_u32 s0, s0, s15
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_addc_u32 s1, s1, 0
-; GFX803-NEXT:    s_mov_b32 s13, s15
-; GFX803-NEXT:    s_mov_b32 s12, s14
+; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT:    s_mov_b32 s14, s16
+; GFX803-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT:    s_mov_b32 s32, 0
 ; GFX803-NEXT:    s_mov_b32 s33, 0
-; GFX803-NEXT:    s_getpc_b64 s[18:19]
-; GFX803-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX803-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT:    s_getpc_b64 s[16:17]
+; GFX803-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX803-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: test_force_fp_kern_call:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s17
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT:    s_mov_b32 s32, 0
 ; GFX900-NEXT:    s_mov_b32 s33, 0
-; GFX900-NEXT:    s_getpc_b64 s[18:19]
-; GFX900-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX900-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT:    s_getpc_b64 s[16:17]
+; GFX900-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: test_force_fp_kern_call:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_add_u32 s12, s12, s17
+; GFX1010-NEXT:    s_add_u32 s10, s10, s15
 ; GFX1010-NEXT:    s_mov_b32 s32, 0
 ; GFX1010-NEXT:    s_mov_b32 s33, 0
-; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1010-NEXT:    s_addc_u32 s11, s11, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1010-NEXT:    s_add_u32 s0, s0, s17
+; GFX1010-NEXT:    s_add_u32 s0, s0, s15
 ; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
-; GFX1010-NEXT:    s_mov_b32 s13, s15
-; GFX1010-NEXT:    s_mov_b32 s12, s14
+; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT:    s_mov_b32 s14, s16
-; GFX1010-NEXT:    s_getpc_b64 s[18:19]
-; GFX1010-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX1010-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX1010-NEXT:    s_getpc_b64 s[16:17]
+; GFX1010-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: test_force_fp_kern_call:
 ; GFX1100:       ; %bb.0: ; %entry
 ; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1100-NEXT:    s_mov_b32 s12, s13
-; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1100-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX1100-NEXT:    s_mov_b32 s13, s14
 ; GFX1100-NEXT:    s_mov_b32 s14, s15
 ; GFX1100-NEXT:    s_mov_b32 s32, 0
 ; GFX1100-NEXT:    s_mov_b32 s33, 0
-; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
-; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1100-NEXT:    s_getpc_b64 s[6:7]
+; GFX1100-NEXT:    s_add_u32 s6, s6, ex at rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s7, s7, ex at rel32@hi+12
+; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1100-NEXT:    s_endpgm
 ; GFX1010-NEXT    s_add_u32 s12, s12, s17
 ; GFX1010-NEXT    s_mov_b32 s32, 0
@@ -424,75 +412,72 @@ entry:
 define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
 ; GFX803-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_add_i32 s12, s12, s17
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX803-NEXT:    s_add_u32 s0, s0, s17
+; GFX803-NEXT:    s_add_u32 s0, s0, s15
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    s_mov_b32 s33, 0
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT:    s_addc_u32 s1, s1, 0
-; GFX803-NEXT:    s_mov_b32 s13, s15
-; GFX803-NEXT:    s_mov_b32 s12, s14
+; GFX803-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
-; GFX803-NEXT:    s_mov_b32 s14, s16
+; GFX803-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX803-NEXT:    s_movk_i32 s32, 0x400
 ; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
-; GFX803-NEXT:    s_getpc_b64 s[18:19]
-; GFX803-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX803-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX803-NEXT:    s_getpc_b64 s[16:17]
+; GFX803-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX803-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX803-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX803-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GFX900-NEXT:    s_add_u32 s0, s0, s17
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT:    s_add_u32 s0, s0, s15
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX900-NEXT:    s_mov_b32 s33, 0
 ; GFX900-NEXT:    s_addc_u32 s1, s1, 0
-; GFX900-NEXT:    s_mov_b32 s13, s15
-; GFX900-NEXT:    s_mov_b32 s12, s14
+; GFX900-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX900-NEXT:    s_mov_b32 s14, s16
+; GFX900-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX900-NEXT:    s_movk_i32 s32, 0x400
 ; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    s_getpc_b64 s[18:19]
-; GFX900-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX900-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX900-NEXT:    s_getpc_b64 s[16:17]
+; GFX900-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX900-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX900-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX1010:       ; %bb.0: ; %entry
-; GFX1010-NEXT:    s_add_u32 s12, s12, s17
+; GFX1010-NEXT:    s_add_u32 s10, s10, s15
 ; GFX1010-NEXT:    s_movk_i32 s32, 0x200
 ; GFX1010-NEXT:    s_mov_b32 s33, 0
-; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX1010-NEXT:    s_addc_u32 s11, s11, 0
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1010-NEXT:    s_add_u32 s0, s0, s17
+; GFX1010-NEXT:    s_add_u32 s0, s0, s15
 ; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
-; GFX1010-NEXT:    s_mov_b32 s13, s15
+; GFX1010-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GFX1010-NEXT:    s_mov_b32 s12, s14
-; GFX1010-NEXT:    s_mov_b32 s14, s16
+; GFX1010-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
 ; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-NEXT:    s_getpc_b64 s[18:19]
-; GFX1010-NEXT:    s_add_u32 s18, s18, ex at rel32@lo+4
-; GFX1010-NEXT:    s_addc_u32 s19, s19, ex at rel32@hi+12
-; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GFX1010-NEXT:    s_getpc_b64 s[16:17]
+; GFX1010-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
+; GFX1010-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
+; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX1010-NEXT:    s_endpgm
 ;
 ; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
@@ -501,19 +486,18 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add
 ; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1100-NEXT:    s_mov_b32 s33, 0
 ; GFX1100-NEXT:    s_mov_b32 s12, s13
-; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX1100-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1100-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX1100-NEXT:    s_mov_b32 s13, s14
 ; GFX1100-NEXT:    s_mov_b32 s14, s15
 ; GFX1100-NEXT:    s_mov_b32 s32, 16
 ; GFX1100-NEXT:    scratch_store_b32 off, v1, s33 offset:4 dlc
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1100-NEXT:    s_getpc_b64 s[16:17]
-; GFX1100-NEXT:    s_add_u32 s16, s16, ex at rel32@lo+4
-; GFX1100-NEXT:    s_addc_u32 s17, s17, ex at rel32@hi+12
-; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1100-NEXT:    s_getpc_b64 s[6:7]
+; GFX1100-NEXT:    s_add_u32 s6, s6, ex at rel32@lo+4
+; GFX1100-NEXT:    s_addc_u32 s7, s7, ex at rel32@hi+12
+; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX1100-NEXT:    s_endpgm
 entry:
   %x = alloca i32, align 4, addrspace(5)
@@ -609,3 +593,6 @@ declare hidden void @ex() local_unnamed_addr #0
 attributes #0 = { nounwind }
 attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
 attributes #2 = { nounwind "frame-pointer"="all" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index c6b17b40ffb6a..362427f5835a9 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -34,36 +34,36 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN_DBG-NEXT:    s_mov_b32 s14, -1
 ; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
+; GCN_DBG-NEXT:    s_add_u32 s12, s12, s9
 ; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN_DBG-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
+; GCN_DBG-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
-; GCN_DBG-NEXT:    s_load_dword s1, s[4:5], 0xa
+; GCN_DBG-NEXT:    s_load_dword s1, s[2:3], 0xa
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s2, -1
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    s_cmp_lg_u32 s1, s2
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_mov_b64 s[6:7], exec
+; GCN_DBG-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN_DBG-NEXT:    s_mov_b64 exec, -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB0_2
 ; GCN_DBG-NEXT:  ; %bb.1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB0_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
@@ -86,15 +86,15 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB0_2
 ; GCN_DBG-NEXT:  ; %bb.3: ; %DummyReturnBlock
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 entry:
@@ -142,31 +142,31 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN_DBG-NEXT:    s_mov_b32 s14, -1
 ; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
+; GCN_DBG-NEXT:    s_add_u32 s12, s12, s9
 ; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN_DBG-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
+; GCN_DBG-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_branch .LBB1_2
 ; GCN_DBG-NEXT:  .LBB1_1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB1_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
@@ -189,9 +189,9 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN_DBG-NEXT:    s_mov_b64 s[2:3], 0
 ; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB1_1
 ; GCN_DBG-NEXT:    s_branch .LBB1_2
 entry:
@@ -230,31 +230,31 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN_DBG-NEXT:    s_mov_b32 s14, -1
 ; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
+; GCN_DBG-NEXT:    s_add_u32 s12, s12, s9
 ; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN_DBG-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
+; GCN_DBG-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_branch .LBB2_2
 ; GCN_DBG-NEXT:  .LBB2_1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB2_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
@@ -277,9 +277,9 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
 ; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB2_1
 ; GCN_DBG-NEXT:    s_branch .LBB2_2
 entry:
@@ -319,31 +319,31 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN_DBG-NEXT:    s_mov_b32 s14, -1
 ; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
+; GCN_DBG-NEXT:    s_add_u32 s12, s12, s9
 ; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN_DBG-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
+; GCN_DBG-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN_DBG-NEXT:    s_mov_b32 s0, 0
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_branch .LBB3_2
 ; GCN_DBG-NEXT:  .LBB3_1: ; %for.exit
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    ; kill: killed $vgpr0
 ; GCN_DBG-NEXT:    s_endpgm
 ; GCN_DBG-NEXT:  .LBB3_2: ; %for.body
 ; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
 ; GCN_DBG-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
 ; GCN_DBG-NEXT:    v_readlane_b32 s0, v0, 1
 ; GCN_DBG-NEXT:    v_readlane_b32 s2, v0, 0
@@ -364,9 +364,9 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
 ; GCN_DBG-NEXT:    s_mov_b32 s1, 1
 ; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN_DBG-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN_DBG-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN_DBG-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GCN_DBG-NEXT:    s_branch .LBB3_2
 entry:
@@ -420,10 +420,10 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
 ; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN_DBG-NEXT:    s_mov_b32 s14, -1
 ; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
+; GCN_DBG-NEXT:    s_add_u32 s12, s12, s9
 ; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN_DBG-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
+; GCN_DBG-NEXT:    s_load_dword s0, s[2:3], 0x9
 ; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN_DBG-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN_DBG-NEXT:    v_mov_b32_e32 v1, 0
@@ -502,3 +502,6 @@ for.body:
   %inc = add i32 %indvar, 1
   br i1 %cond, label %for.body, label %for.exit
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 8b1c5c3ae071c..b2098344f93fd 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -46,14 +46,14 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-O0-NEXT:    s_mov_b32 s14, -1
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
+; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 1
@@ -220,14 +220,14 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-O0-NEXT:    s_mov_b32 s14, -1
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
+; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 1
@@ -431,14 +431,14 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-O0-NEXT:    s_mov_b32 s14, -1
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
+; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -679,14 +679,14 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-O0-NEXT:    s_mov_b32 s14, -1
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
+; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[8:9]
-; GCN-O0-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-O0-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x9
 ; GCN-O0-NEXT:    v_mov_b32_e32 v2, v1
 ; GCN-O0-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b32 s0, 2
@@ -933,14 +933,14 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
 ; GCN-O0-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-O0-NEXT:    s_mov_b32 s14, -1
 ; GCN-O0-NEXT:    s_mov_b32 s15, 0xe8f000
-; GCN-O0-NEXT:    s_add_u32 s12, s12, s11
+; GCN-O0-NEXT:    s_add_u32 s12, s12, s9
 ; GCN-O0-NEXT:    s_addc_u32 s13, s13, 0
 ; GCN-O0-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; GCN-O0-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[6:7]
-; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-O0-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
 ; GCN-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s0, 0
 ; GCN-O0-NEXT:    v_writelane_b32 v0, s1, 1
@@ -1437,3 +1437,6 @@ declare void @llvm.amdgcn.s.barrier() #1
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind convergent }
 attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index f97d15739be29..11871db1ef656 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -176,30 +176,28 @@ bb1:
 define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
 ; GCN-LABEL: v3i16_registers:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT:    s_load_dword s12, s[8:9], 0x0
-; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s17
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GCN-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s12, 0
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
+; GCN-NEXT:    s_bitcmp1_b32 s8, 0
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_vccnz .LBB4_2
 ; GCN-NEXT:  ; %bb.1: ; %if.else
-; GCN-NEXT:    s_add_u32 s8, s8, 8
+; GCN-NEXT:    s_add_u32 s8, s6, 8
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_addc_u32 s9, s7, 0
 ; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT:    s_mov_b32 s12, s14
-; GCN-NEXT:    s_mov_b32 s13, s15
-; GCN-NEXT:    s_mov_b32 s14, s16
-; GCN-NEXT:    s_getpc_b64 s[18:19]
-; GCN-NEXT:    s_add_u32 s18, s18, func_v3i16 at rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s19, s19, func_v3i16 at rel32@hi+12
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v3i16 at rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v3i16 at rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_branch .LBB4_3
 ; GCN-NEXT:  .LBB4_2:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
@@ -227,30 +225,28 @@ if.end:                                           ; preds = %if.else, %if.then
 define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
 ; GCN-LABEL: v3f16_registers:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GCN-NEXT:    s_load_dword s12, s[8:9], 0x0
-; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s17
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GCN-NEXT:    s_load_dword s8, s[6:7], 0x0
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s12, 0
-; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
+; GCN-NEXT:    s_bitcmp1_b32 s8, 0
+; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_vccnz .LBB5_2
 ; GCN-NEXT:  ; %bb.1: ; %if.else
-; GCN-NEXT:    s_add_u32 s8, s8, 8
+; GCN-NEXT:    s_add_u32 s8, s6, 8
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    s_addc_u32 s9, s9, 0
+; GCN-NEXT:    s_addc_u32 s9, s7, 0
 ; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
-; GCN-NEXT:    s_mov_b32 s12, s14
-; GCN-NEXT:    s_mov_b32 s13, s15
-; GCN-NEXT:    s_mov_b32 s14, s16
-; GCN-NEXT:    s_getpc_b64 s[18:19]
-; GCN-NEXT:    s_add_u32 s18, s18, func_v3f16 at rel32@lo+4
-; GCN-NEXT:    s_addc_u32 s19, s19, func_v3f16 at rel32@hi+12
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, func_v3f16 at rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, func_v3f16 at rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_branch .LBB5_3
 ; GCN-NEXT:  .LBB5_2:
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
@@ -285,3 +281,6 @@ declare hidden <3 x half> @func_v3f16()
 declare hidden { <4 x i32>, <4 x half> } @func_struct() #0
 
 attributes #0 = { nounwind}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 901cbd4a52720..7b0ad8625b45f 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -944,7 +944,7 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; GFX9-LABEL: load_i8_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
@@ -1026,7 +1026,7 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX9-LABEL: load_v2i8_to_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1116,7 +1116,7 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX9-LABEL: load_v3i8_to_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1209,7 +1209,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1334,7 +1334,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1497,7 +1497,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1672,13 +1672,13 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ;
 ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_movk_i32 s4, 0x900
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT:    s_movk_i32 s4, 0x900
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
@@ -1867,7 +1867,7 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX9-LABEL: load_v7i8_to_v7f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2008,7 +2008,7 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX9-LABEL: load_v8i8_to_v8f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2109,7 +2109,7 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2194,7 +2194,7 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2275,7 +2275,7 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; GFX9-LABEL: i8_zext_i32_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
@@ -2389,7 +2389,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou
 ;
 ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2489,7 +2489,7 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-LABEL: extract_byte0_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2570,7 +2570,7 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-LABEL: extract_byte1_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2652,7 +2652,7 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-LABEL: extract_byte2_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2734,7 +2734,7 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p
 ;
 ; GFX9-LABEL: extract_byte3_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2823,7 +2823,7 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX9-LABEL: cvt_ubyte0_or_multiuse:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3019,3 +3019,6 @@ for.body.i:                                       ; preds = %for.body.i, %entry
   store i32 %retval.sroa.0.0.insert.insert, ptr addrspace(1) undef, align 1
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 9d94f8e6ca227..1b0ffb97ecb1a 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1334,31 +1334,29 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    s_getpc_b64 s[40:41]
 ; CI-NEXT:    s_mov_b32 s40, s0
 ; CI-NEXT:    s_load_dwordx4 s[40:43], s[40:41], 0x0
-; CI-NEXT:    s_mov_b32 s14, s10
+; CI-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; CI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x0
+; CI-NEXT:    s_load_dword s4, s[2:3], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_mov_b32 s12, s8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s40, s40, s11
-; CI-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; CI-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
-; CI-NEXT:    s_load_dword s6, s[4:5], 0x2
+; CI-NEXT:    s_add_u32 s40, s40, s9
 ; CI-NEXT:    s_addc_u32 s41, s41, 0
-; CI-NEXT:    s_add_u32 s8, s4, 12
-; CI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CI-NEXT:    s_mov_b32 s13, s9
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_add_i32_e32 v40, vcc, s6, v3
+; CI-NEXT:    v_add_i32_e32 v40, vcc, s4, v3
 ; CI-NEXT:    ds_read_b32 v41, v40
-; CI-NEXT:    s_addc_u32 s9, s5, 0
+; CI-NEXT:    s_mov_b32 s14, s8
+; CI-NEXT:    s_add_u32 s8, s2, 12
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CI-NEXT:    s_addc_u32 s9, s3, 0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; CI-NEXT:    s_mov_b32 s17, void_func_void at abs32@hi
 ; CI-NEXT:    s_mov_b32 s16, void_func_void at abs32@lo
 ; CI-NEXT:    v_or_b32_e32 v31, v0, v2
+; CI-NEXT:    s_mov_b32 s12, s6
+; CI-NEXT:    s_mov_b32 s13, s7
 ; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_mov_b32 s39, 0xf000
@@ -1375,28 +1373,26 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-NEXT:    s_mov_b32 s36, s0
 ; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x8
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x8
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
+; GFX9-NEXT:    v_lshl_add_u32 v41, v0, 2, s4
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_add_u32 s8, s4, 12
-; GFX9-NEXT:    s_addc_u32 s9, s5, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_lshl_add_u32 v41, v0, 2, s6
 ; GFX9-NEXT:    ds_read_b32 v42, v41
+; GFX9-NEXT:    s_add_u32 s8, s2, 12
+; GFX9-NEXT:    s_addc_u32 s9, s3, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    s_mov_b32 s17, void_func_void at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s16, void_func_void at abs32@lo
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v40, 0
@@ -1544,3 +1540,6 @@ attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
 attributes #2 = { convergent nounwind }
 attributes #3 = { nounwind noinline }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index f4d985833eb9a..36a7ed51227a6 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -50,19 +50,19 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
 ; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v41, v31
+; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
 ; CHECK-NEXT:    s_mov_b32 s42, s15
 ; CHECK-NEXT:    s_mov_b32 s43, s14
 ; CHECK-NEXT:    s_mov_b32 s44, s13
 ; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[8:9]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[46:47]
 ; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[36:37]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
 ; CHECK-NEXT:    s_mov_b32 s12, s45
 ; CHECK-NEXT:    s_mov_b32 s13, s44
 ; CHECK-NEXT:    s_mov_b32 s14, s43
@@ -110,7 +110,7 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
 
 !llvm.dbg.cu = !{!0, !25, !26}
-!llvm.module.flags = !{!27, !28, !29, !30, !31, !32}
+!llvm.module.flags = !{!27, !28, !29, !30, !31, !32, !44}
 
 !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 05256c8d95e0b15bcc502d595c15d902ff520f97)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !8, imports: !20, splitDebugInlining: false, nameTableKind: None)
 !1 = !DIFile(filename: "dummy", directory: "dummy", checksumkind: CSK_MD5, checksum: "b67bec84bdce3730b4a6f2ed8d50b85c")
@@ -155,3 +155,4 @@ attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willre
 !41 = !DILocalVariable(name: "dummy", arg: 3, scope: !34, file: !1, line: 49, type: !5)
 !42 = !DILocalVariable(name: "dummy", arg: 4, scope: !34, file: !1, line: 49, type: !5)
 !43 = !DILocation(line: 49, column: 9, scope: !34)
+!44 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index b3cca5ff2429b..bcf6bda807c8f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -62,45 +62,43 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
 define amdgpu_kernel void @kernel_calls_no_stack() {
 ; FLAT_SCR_OPT-LABEL: kernel_calls_no_stack:
 ; FLAT_SCR_OPT:       ; %bb.0:
-; FLAT_SCR_OPT-NEXT:    s_add_u32 s8, s8, s13
+; FLAT_SCR_OPT-NEXT:    s_add_u32 s6, s6, s11
 ; FLAT_SCR_OPT-NEXT:    s_mov_b32 s32, 0
-; FLAT_SCR_OPT-NEXT:    s_addc_u32 s9, s9, 0
-; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
-; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
-; FLAT_SCR_OPT-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; FLAT_SCR_OPT-NEXT:    s_addc_u32 s7, s7, 0
+; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
+; FLAT_SCR_OPT-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s14, s10
+; FLAT_SCR_OPT-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; FLAT_SCR_OPT-NEXT:    s_getpc_b64 s[4:5]
 ; FLAT_SCR_OPT-NEXT:    s_add_u32 s4, s4, extern_func at gotpcrel32@lo+4
 ; FLAT_SCR_OPT-NEXT:    s_addc_u32 s5, s5, extern_func at gotpcrel32@hi+12
 ; FLAT_SCR_OPT-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; FLAT_SCR_OPT-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; FLAT_SCR_OPT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; FLAT_SCR_OPT-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s14, s12
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s13, s11
-; FLAT_SCR_OPT-NEXT:    s_mov_b32 s12, s10
-; FLAT_SCR_OPT-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; FLAT_SCR_OPT-NEXT:    v_or3_b32 v31, v0, v1, v2
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s13, s9
+; FLAT_SCR_OPT-NEXT:    s_mov_b32 s12, s8
 ; FLAT_SCR_OPT-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; FLAT_SCR_OPT-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; FLAT_SCR_OPT-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; FLAT_SCR_OPT-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; FLAT_SCR_OPT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT_SCR_OPT-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; FLAT_SCR_OPT-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; FLAT_SCR_OPT-NEXT:    s_endpgm
 ;
 ; FLAT_SCR_ARCH-LABEL: kernel_calls_no_stack:
 ; FLAT_SCR_ARCH:       ; %bb.0:
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s13, s9
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s12, s8
-; FLAT_SCR_ARCH-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; FLAT_SCR_ARCH-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; FLAT_SCR_ARCH-NEXT:    s_getpc_b64 s[4:5]
 ; FLAT_SCR_ARCH-NEXT:    s_add_u32 s4, s4, extern_func at gotpcrel32@lo+4
 ; FLAT_SCR_ARCH-NEXT:    s_addc_u32 s5, s5, extern_func at gotpcrel32@hi+12
 ; FLAT_SCR_ARCH-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; FLAT_SCR_ARCH-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
 ; FLAT_SCR_ARCH-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s14, s10
-; FLAT_SCR_ARCH-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s14, s8
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; FLAT_SCR_ARCH-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; FLAT_SCR_ARCH-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s12, s6
 ; FLAT_SCR_ARCH-NEXT:    v_or3_b32 v31, v0, v1, v2
+; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s13, s7
 ; FLAT_SCR_ARCH-NEXT:    s_mov_b32 s32, 0
 ; FLAT_SCR_ARCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLAT_SCR_ARCH-NEXT:    s_swappc_b64 s[30:31], s[16:17]
@@ -418,3 +416,6 @@ define amdgpu_kernel void @kernel_no_calls_no_stack() {
 }
 
 attributes #0 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 4cbd5e84871cc..064831f098667 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -340,27 +340,26 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -390,25 +389,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -460,25 +458,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -530,25 +527,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -592,23 +588,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -646,23 +641,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -700,25 +694,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -785,25 +778,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -868,25 +860,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -941,23 +932,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -1013,23 +1003,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -1546,27 +1535,26 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1596,25 +1584,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1666,25 +1653,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1736,25 +1722,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1798,23 +1783,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -1863,23 +1847,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -1928,25 +1911,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2013,25 +1995,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2096,25 +2077,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2169,23 +2149,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -2252,23 +2231,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -2796,27 +2774,26 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2846,25 +2823,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2916,25 +2892,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2986,25 +2961,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3048,23 +3022,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3102,23 +3075,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3156,25 +3128,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3241,25 +3212,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3324,25 +3294,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3397,23 +3366,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -3469,23 +3437,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -3544,27 +3511,26 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3594,25 +3560,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3664,25 +3629,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3734,25 +3698,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3796,23 +3759,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB6_1: ; %ComputeLoop
@@ -3850,23 +3812,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB6_1: ; %ComputeLoop
@@ -3904,25 +3865,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3989,25 +3949,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4072,25 +4031,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4145,23 +4103,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -4217,23 +4174,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -4749,27 +4705,26 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4799,25 +4754,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4869,25 +4823,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4939,25 +4892,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5001,23 +4953,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB8_1: ; %ComputeLoop
@@ -5066,23 +5017,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB8_1: ; %ComputeLoop
@@ -5131,25 +5081,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5216,25 +5165,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5299,25 +5247,24 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5372,23 +5319,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -5455,23 +5401,22 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -5535,3 +5480,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #2 = { strictfp}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index e07514b063ee4..66c89de1789ee 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -357,27 +357,26 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -409,25 +408,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -483,25 +481,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -557,25 +554,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -623,23 +619,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -693,23 +688,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -762,25 +756,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -856,25 +849,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -949,25 +941,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1030,23 +1021,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -1126,23 +1116,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -1556,27 +1545,26 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1608,25 +1596,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1682,25 +1669,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1756,25 +1742,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1822,23 +1807,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -1892,23 +1876,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -1961,25 +1944,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2055,25 +2037,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2148,25 +2129,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2229,23 +2209,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -2325,23 +2304,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -2755,27 +2733,26 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2807,25 +2784,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2881,25 +2857,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2955,25 +2930,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3021,23 +2995,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3091,23 +3064,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v2, 0xff800000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3160,25 +3132,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3254,25 +3225,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3347,25 +3317,24 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3428,23 +3397,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -3524,23 +3492,22 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -3611,3 +3578,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
 }
 
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index ad6edbd2c37a0..17533e22ce2af 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -357,27 +357,26 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -409,25 +408,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -483,25 +481,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -557,25 +554,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -623,23 +619,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -693,23 +688,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -762,25 +756,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -856,25 +849,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -949,25 +941,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1030,23 +1021,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -1126,23 +1116,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -1556,27 +1545,26 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1608,25 +1596,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1682,25 +1669,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1756,25 +1742,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1822,23 +1807,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -1892,23 +1876,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -1961,25 +1944,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2055,25 +2037,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2148,25 +2129,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2229,23 +2209,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -2325,23 +2304,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -2755,27 +2733,26 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2807,25 +2784,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2881,25 +2857,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2955,25 +2930,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3021,23 +2995,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3091,23 +3064,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3160,25 +3132,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3254,25 +3225,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3347,25 +3317,24 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3428,23 +3397,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -3524,23 +3492,22 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_div_value_defalut_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -3611,3 +3578,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
 }
 
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 4a00d7bc71bca..a8b83edfa7438 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -392,27 +392,26 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -442,25 +441,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -512,25 +510,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -582,25 +579,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -644,23 +640,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -709,23 +704,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB1_1: ; %ComputeLoop
@@ -774,25 +768,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -859,25 +852,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -942,25 +934,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1015,23 +1006,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -1098,23 +1088,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -1642,27 +1631,26 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1692,25 +1680,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1762,25 +1749,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1832,25 +1818,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1894,23 +1879,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -1959,23 +1943,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
@@ -2024,25 +2007,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2109,25 +2091,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2192,25 +2173,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2265,23 +2245,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -2348,23 +2327,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -2892,27 +2870,26 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2942,25 +2919,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3012,25 +2988,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3082,25 +3057,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3144,23 +3118,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3209,23 +3182,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB5_1: ; %ComputeLoop
@@ -3274,25 +3246,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3359,25 +3330,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3442,25 +3412,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3515,23 +3484,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -3598,23 +3566,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -3684,27 +3651,26 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3734,25 +3700,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3804,25 +3769,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3874,25 +3838,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3936,23 +3899,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB6_1: ; %ComputeLoop
@@ -4001,23 +3963,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB6_1: ; %ComputeLoop
@@ -4066,25 +4027,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4151,25 +4111,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4234,25 +4193,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4307,23 +4265,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -4390,23 +4347,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -4933,27 +4889,26 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX7LESS-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
 ; GFX7LESS-NEXT:    s_mov_b32 s42, -1
 ; GFX7LESS-NEXT:    s_mov_b32 s43, 0xe8f000
-; GFX7LESS-NEXT:    s_add_u32 s40, s40, s11
+; GFX7LESS-NEXT:    s_add_u32 s40, s40, s9
 ; GFX7LESS-NEXT:    s_addc_u32 s41, s41, 0
-; GFX7LESS-NEXT:    s_mov_b32 s14, s10
-; GFX7LESS-NEXT:    s_mov_b32 s13, s9
-; GFX7LESS-NEXT:    s_mov_b32 s12, s8
-; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s14, s8
+; GFX7LESS-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
 ; GFX7LESS-NEXT:    s_mov_b32 s39, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s38, -1
-; GFX7LESS-NEXT:    s_add_u32 s8, s4, 44
-; GFX7LESS-NEXT:    s_addc_u32 s9, s5, 0
-; GFX7LESS-NEXT:    s_getpc_b64 s[4:5]
-; GFX7LESS-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX7LESS-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX7LESS-NEXT:    s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT:    s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT:    s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX7LESS-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX7LESS-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7LESS-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GFX7LESS-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX7LESS-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7LESS-NEXT:    s_mov_b32 s12, s6
+; GFX7LESS-NEXT:    s_mov_b32 s13, s7
 ; GFX7LESS-NEXT:    s_mov_b64 s[0:1], s[40:41]
 ; GFX7LESS-NEXT:    s_mov_b64 s[2:3], s[42:43]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4983,25 +4938,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s38, -1
 ; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s14, s8
 ; GFX9-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-NEXT:    s_mov_b32 s14, s10
-; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT:    s_mov_b32 s12, s6
+; GFX9-NEXT:    s_mov_b32 s13, s7
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5053,25 +5007,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1064-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-NEXT:    s_mov_b32 s12, s8
+; GFX1064-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-NEXT:    s_mov_b32 s14, s10
-; GFX1064-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT:    s_mov_b32 s12, s6
+; GFX1064-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5123,25 +5076,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1032-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-NEXT:    s_mov_b32 s12, s8
+; GFX1032-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-NEXT:    s_mov_b32 s14, s10
-; GFX1032-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT:    s_mov_b32 s12, s6
+; GFX1032-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5185,23 +5137,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-NEXT:    s_mov_b32 s12, s8
+; GFX1164-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-NEXT:    s_mov_b32 s14, s10
-; GFX1164-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s12, s6
+; GFX1164-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX1164-NEXT:  .LBB8_1: ; %ComputeLoop
@@ -5250,23 +5201,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-NEXT:    s_mov_b32 s12, s13
-; GFX1132-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1132-NEXT:  .LBB8_1: ; %ComputeLoop
@@ -5315,25 +5265,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX9-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX9-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX9-DPP-NEXT:    s_mov_b32 s39, 0xe00000
-; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s11
+; GFX9-DPP-NEXT:    s_add_u32 s36, s36, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX9-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX9-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX9-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX9-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX9-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX9-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX9-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX9-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX9-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
-; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; GFX9-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX9-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX9-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
+; GFX9-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
 ; GFX9-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
 ; GFX9-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX9-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX9-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX9-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX9-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX9-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5400,25 +5349,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1064-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1064-DPP-NEXT:    s_mov_b32 s39, 0x31e16000
-; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1064-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1064-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1064-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1064-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1064-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1064-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1064-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1064-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1064-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1064-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1064-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1064-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1064-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1064-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1064-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1064-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1064-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1064-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1064-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5483,25 +5431,24 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ; GFX1032-DPP-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s38, -1
 ; GFX1032-DPP-NEXT:    s_mov_b32 s39, 0x31c16000
-; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s11
-; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1032-DPP-NEXT:    s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1032-DPP-NEXT:    s_addc_u32 s37, s37, 0
-; GFX1032-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1032-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1032-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1032-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1032-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1032-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1032-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1032-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1032-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1032-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1032-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
 ; GFX1032-DPP-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GFX1032-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1032-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1032-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1032-DPP-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX1032-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1032-DPP-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5556,23 +5503,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ;
 ; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1164-DPP:       ; %bb.0:
-; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
-; GFX1164-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX1164-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s14, s8
 ; GFX1164-DPP-NEXT:    s_add_u32 s8, s34, 44
-; GFX1164-DPP-NEXT:    s_mov_b32 s13, s9
 ; GFX1164-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1164-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1164-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1164-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1164-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1164-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1164-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1164-DPP-NEXT:    v_mov_b32_e32 v31, v0
-; GFX1164-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1164-DPP-NEXT:    s_mov_b32 s14, s10
-; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1164-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1164-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1164-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1164-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1164-DPP-NEXT:    s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT:    s_mov_b32 s13, s7
 ; GFX1164-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1164-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1164-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX1164-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1164-DPP-NEXT:    s_mov_b64 exec, s[0:1]
@@ -5639,23 +5585,22 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
 ;
 ; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_div_value_defalut_scope_strictfp:
 ; GFX1132-DPP:       ; %bb.0:
-; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX1132-DPP-NEXT:    s_mov_b64 s[34:35], s[2:3]
 ; GFX1132-DPP-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1132-DPP-NEXT:    s_add_u32 s8, s34, 44
 ; GFX1132-DPP-NEXT:    s_addc_u32 s9, s35, 0
-; GFX1132-DPP-NEXT:    s_getpc_b64 s[4:5]
-; GFX1132-DPP-NEXT:    s_add_u32 s4, s4, div.float.value at gotpcrel32@lo+4
-; GFX1132-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value at gotpcrel32@hi+12
+; GFX1132-DPP-NEXT:    s_getpc_b64 s[2:3]
+; GFX1132-DPP-NEXT:    s_add_u32 s2, s2, div.float.value at gotpcrel32@lo+4
+; GFX1132-DPP-NEXT:    s_addc_u32 s3, s3, div.float.value at gotpcrel32@hi+12
 ; GFX1132-DPP-NEXT:    s_mov_b32 s12, s13
-; GFX1132-DPP-NEXT:    s_load_b64 s[16:17], s[4:5], 0x0
-; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1132-DPP-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1132-DPP-NEXT:    s_mov_b64 s[10:11], s[4:5]
 ; GFX1132-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX1132-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX1132-DPP-NEXT:    s_mov_b32 s13, s14
 ; GFX1132-DPP-NEXT:    s_mov_b32 s14, s15
 ; GFX1132-DPP-NEXT:    s_mov_b32 s32, 0
 ; GFX1132-DPP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX1132-DPP-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX1132-DPP-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX1132-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX1132-DPP-NEXT:    s_mov_b32 exec_lo, s0
@@ -5720,3 +5665,5 @@ attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-u
 attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #2 = { strictfp}
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 500}



More information about the llvm-commits mailing list