[llvm] Update amdgpu_gfx functions to use s0-s3 for inreg SGPR arguments on targets using scratch instructions for stack #78226 (PR #81394)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 20 23:08:14 PDT 2024


https://github.com/SahilPatidar updated https://github.com/llvm/llvm-project/pull/81394

>From 7b2bcdb69d8301c020d0789279dde7052a26fdef Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil at 2001gmail.com>
Date: Sat, 10 Feb 2024 17:14:14 +0530
Subject: [PATCH 1/3] amdgpu_gfx functions do not use s0-s3 for inreg SGPR
 arguments on targets using scratch instructions for stack #78226

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 13 +++++++------
 llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |  1 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 14 +++++++-------
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6d05c3678bf09e..4087935cc3ad32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -715,10 +715,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   if (!IsEntryFunc && !IsGraphics) {
     // For the fixed ABI, pass workitem IDs in the last argument register.
     TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
-
-    if (!Subtarget.enableFlatScratch())
-      CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
   IncomingValueAssigner Assigner(AssignFn);
@@ -732,9 +728,14 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   uint64_t StackSize = Assigner.StackSize;
 
   // Start adding system SGPRs.
-  if (IsEntryFunc)
+  if (IsEntryFunc) {
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
-
+  } else {
+    if (!Subtarget.enableFlatScratch())
+      CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    if (!IsGraphics)
+      TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+  }
   // When we tail call, we need to check if the callee's arguments will fit on
   // the caller's stack. So, whenever we lower formal arguments, we should keep
   // track of this information, since we might lower a tail call in this
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c5207228dc913f..4c922a81c02efd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -23,6 +23,7 @@ def CC_SI_Gfx : CallingConv<[
   // 33 is reserved for the frame pointer
   // 34 is reserved for the base pointer
   CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3,
     SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab9563..7e15f10bf00a80 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2837,12 +2837,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   } else if (!IsGraphics) {
     // For the fixed ABI, pass workitem IDs in the last argument register.
     allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
-
-    // FIXME: Sink this into allocateSpecialInputSGPRs
-    if (!Subtarget->enableFlatScratch())
-      CCInfo.AllocateReg(Info->getScratchRSrcReg());
-
-    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
   if (!IsKernel) {
@@ -3046,8 +3040,14 @@ SDValue SITargetLowering::LowerFormalArguments(
   }
 
   // Start adding system SGPRs.
-  if (IsEntryFunc)
+  if (IsEntryFunc) {
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
+  } else {
+    if (!Subtarget->enableFlatScratch())
+      CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    if (!IsGraphics)
+      allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+  }
 
   auto &ArgUsageInfo =
     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();

>From 8f604002f24d736a4f0cc8bc3b73c0e5b7f03750 Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil at 2001gmail.com>
Date: Tue, 13 Feb 2024 21:40:44 +0530
Subject: [PATCH 2/3] modified test

---
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 761 +++++++++---------
 1 file changed, 402 insertions(+), 359 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e79cb66dcd7760..823c5456b23aff 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -11,14 +11,16 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_mov_b32 s40, s6
-; GFX9-O0-NEXT:    s_mov_b32 s34, s4
+; GFX9-O0-NEXT:    s_mov_b32 s35, s3
+; GFX9-O0-NEXT:    s_mov_b32 s40, s2
+; GFX9-O0-NEXT:    s_mov_b32 s36, s1
+; GFX9-O0-NEXT:    s_mov_b32 s34, s0
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT:    s_mov_b32 s41, s7
+; GFX9-O0-NEXT:    s_mov_b32 s41, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s41
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s5
+; GFX9-O0-NEXT:    s_mov_b32 s35, s36
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -76,7 +78,11 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0
+; GFX9-O3-NEXT:    s_mov_b32 s36, s0
+; GFX9-O3-NEXT:    s_mov_b32 s37, s1
+; GFX9-O3-NEXT:    s_mov_b32 s38, s2
+; GFX9-O3-NEXT:    s_mov_b32 s39, s3
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
@@ -102,7 +108,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX9-O3-NEXT:    v_and_b32_e32 v4, 2, v4
-; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[4:7], 0 offset:4
+; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[36:39], 0 offset:4
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -150,14 +156,16 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    s_mov_b32 s40, s6
-; GFX9-O0-NEXT:    s_mov_b32 s34, s4
+; GFX9-O0-NEXT:    s_mov_b32 s35, s3
+; GFX9-O0-NEXT:    s_mov_b32 s40, s2
+; GFX9-O0-NEXT:    s_mov_b32 s36, s1
+; GFX9-O0-NEXT:    s_mov_b32 s34, s0
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT:    s_mov_b32 s41, s7
+; GFX9-O0-NEXT:    s_mov_b32 s41, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s41
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s5
+; GFX9-O0-NEXT:    s_mov_b32 s35, s36
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -266,7 +274,15 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; GFX9-O3-NEXT:    s_mov_b32 s40, s0
+; GFX9-O3-NEXT:    s_mov_b32 s41, s1
+; GFX9-O3-NEXT:    s_mov_b32 s42, s2
+; GFX9-O3-NEXT:    s_mov_b32 s43, s3
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[3:4], off, s[40:43], 0
+; GFX9-O3-NEXT:    s_mov_b32 s38, s2
+; GFX9-O3-NEXT:    s_mov_b32 s39, s3
+; GFX9-O3-NEXT:    s_mov_b32 s36, s0
+; GFX9-O3-NEXT:    s_mov_b32 s37, s1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
@@ -285,17 +301,17 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX9-O3-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O3-NEXT:  ; %bb.1: ; %if
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[40:41], -1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[40:41]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9-O3-NEXT:  .LBB1_2: ; %merge
 ; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[34:35]
@@ -303,7 +319,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-O3-NEXT:    v_and_b32_e32 v0, 2, v0
-; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -376,16 +392,18 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT:    s_mov_b32 s40, s6
-; GFX9-O0-NEXT:    s_mov_b32 s34, s4
+; GFX9-O0-NEXT:    v_writelane_b32 v2, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-O0-NEXT:    s_mov_b32 s35, s3
+; GFX9-O0-NEXT:    s_mov_b32 s40, s2
+; GFX9-O0-NEXT:    s_mov_b32 s36, s1
+; GFX9-O0-NEXT:    s_mov_b32 s34, s0
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT:    s_mov_b32 s41, s7
+; GFX9-O0-NEXT:    s_mov_b32 s41, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s41
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s5
+; GFX9-O0-NEXT:    s_mov_b32 s35, s36
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -393,7 +411,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s43
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s42
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
@@ -427,7 +445,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-LABEL: strict_wwm_call:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_mov_b32 s38, s33
+; GFX9-O3-NEXT:    s_mov_b32 s42, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -436,31 +454,35 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s31, 1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-O3-NEXT:    v_writelane_b32 v2, s31, 1
+; GFX9-O3-NEXT:    s_mov_b32 s36, s0
+; GFX9-O3-NEXT:    s_mov_b32 s37, s1
+; GFX9-O3-NEXT:    s_mov_b32 s38, s2
+; GFX9-O3-NEXT:    s_mov_b32 s39, s3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called at rel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called at rel32@hi+12
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-O3-NEXT:    s_getpc_b64 s[40:41]
+; GFX9-O3-NEXT:    s_add_u32 s40, s40, strict_wwm_called at rel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s41, s41, strict_wwm_called at rel32@hi+12
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[40:41]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v2, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v2, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-O3-NEXT:    s_mov_b32 s33, s38
+; GFX9-O3-NEXT:    s_mov_b32 s33, s42
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
@@ -581,17 +603,19 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
-; GFX9-O0-NEXT:    s_mov_b32 s34, s8
-; GFX9-O0-NEXT:    s_mov_b32 s38, s6
-; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v8, s31, 1
+; GFX9-O0-NEXT:    s_mov_b32 s34, s4
+; GFX9-O0-NEXT:    s_mov_b32 s35, s3
+; GFX9-O0-NEXT:    s_mov_b32 s38, s2
+; GFX9-O0-NEXT:    s_mov_b32 s40, s1
+; GFX9-O0-NEXT:    s_mov_b32 s36, s0
 ; GFX9-O0-NEXT:    ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
-; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    s_mov_b32 s39, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s39
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s38
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
-; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s37, s40
 ; GFX9-O0-NEXT:    s_mov_b32 s45, s37
 ; GFX9-O0-NEXT:    s_mov_b32 s40, s36
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
@@ -603,7 +627,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_writelane_b32 v0, s42, 2
 ; GFX9-O0-NEXT:    v_writelane_b32 v0, s43, 3
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s9
+; GFX9-O0-NEXT:    s_mov_b32 s35, s5
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s34
@@ -686,7 +710,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-LABEL: strict_wwm_call_i64:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_mov_b32 s40, s33
+; GFX9-O3-NEXT:    s_mov_b32 s44, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -703,32 +727,36 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-O3-NEXT:    v_writelane_b32 v8, s31, 1
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called_i64 at gotpcrel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called_i64 at gotpcrel32@hi+12
-; GFX9-O3-NEXT:    s_load_dwordx2 s[36:37], s[36:37], 0x0
+; GFX9-O3-NEXT:    s_getpc_b64 s[40:41]
+; GFX9-O3-NEXT:    s_add_u32 s40, s40, strict_wwm_called_i64 at gotpcrel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s41, s41, strict_wwm_called_i64 at gotpcrel32@hi+12
+; GFX9-O3-NEXT:    s_load_dwordx2 s[40:41], s[40:41], 0x0
+; GFX9-O3-NEXT:    s_mov_b32 s36, s0
+; GFX9-O3-NEXT:    s_mov_b32 s37, s1
+; GFX9-O3-NEXT:    s_mov_b32 s38, s2
+; GFX9-O3-NEXT:    s_mov_b32 s39, s3
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s4
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[38:39], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[42:43], -1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v8
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[40:41]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[38:39]
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[42:43]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
+; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], 0 offset:4
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v6, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v6, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -739,7 +767,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT:    s_mov_b32 s33, s40
+; GFX9-O3-NEXT:    s_mov_b32 s33, s44
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
@@ -760,11 +788,14 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_mov_b32 s36, s4
+; GFX9-O0-NEXT:    s_mov_b32 s34, s3
+; GFX9-O0-NEXT:    s_mov_b32 s35, s2
+; GFX9-O0-NEXT:    s_mov_b32 s40, s1
+; GFX9-O0-NEXT:    s_mov_b32 s36, s0
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT:    s_mov_b32 s37, s5
-; GFX9-O0-NEXT:    s_mov_b32 s38, s6
-; GFX9-O0-NEXT:    s_mov_b32 s39, s7
+; GFX9-O0-NEXT:    s_mov_b32 s37, s40
+; GFX9-O0-NEXT:    s_mov_b32 s38, s35
+; GFX9-O0-NEXT:    s_mov_b32 s39, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 5
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s34, v0
@@ -848,9 +879,13 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT:    s_mov_b32 s36, s0
+; GFX9-O3-NEXT:    s_mov_b32 s37, s1
+; GFX9-O3-NEXT:    s_mov_b32 s38, s2
+; GFX9-O3-NEXT:    s_mov_b32 s39, s3
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[36:39], 0 offen
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[36:39], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_mov_b32 s34, -1
 ; GFX9-O3-NEXT:    s_brev_b32 s35, -2
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
@@ -879,8 +914,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[36:39], 0 offen
+; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[36:39], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -922,270 +957,270 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
-; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v47, s64, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v47, s65, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v47, s66, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v47, s67, 3
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s64, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s65, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s66, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s67, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s68, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s69, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s70, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v32, s71, 7
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b32 s34, s3
+; GFX9-O0-NEXT:    s_mov_b32 s35, s2
+; GFX9-O0-NEXT:    s_mov_b32 s36, s1
+; GFX9-O0-NEXT:    s_mov_b32 s37, s0
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s5
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s37
+; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s35
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s20
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s21
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s24
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s25
-; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s28
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s29
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s25
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s26
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v47, s27
+; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s29
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v46
-; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v45
-; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v44
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v43
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v43
 ; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v42
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v46
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v45
-; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v44
-; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v43
-; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr42 killed $exec
-; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v44
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v43
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v47
+; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v46
+; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v45
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v44
+; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr43 killed $exec
+; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v11
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
 ; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX9-O0-NEXT:    s_mov_b64 s[34:35], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v11
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71
+; GFX9-O0-NEXT:    s_mov_b64 s[38:39], 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v11
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s39
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v35, v9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s39
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v36, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v37, v7
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s39
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v38, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v39, v5
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s39
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v40, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v41, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s38
+; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s39
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v33
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
@@ -1207,56 +1242,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s16
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s17
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s18
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s19
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, s21
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, s22
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, s23
-; GFX9-O0-NEXT:    v_mov_b32_e32 v20, s24
-; GFX9-O0-NEXT:    v_mov_b32_e32 v21, s25
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, s26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v23, s27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v24, s28
-; GFX9-O0-NEXT:    v_mov_b32_e32 v25, s29
-; GFX9-O0-NEXT:    v_readlane_b32 s67, v47, 3
-; GFX9-O0-NEXT:    v_readlane_b32 s66, v47, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s65, v47, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s64, v47, 0
-; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s37
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s36
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, s17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, s18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, s19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, s20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, s21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, s22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, s23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v24, s24
+; GFX9-O0-NEXT:    v_mov_b32_e32 v25, s25
+; GFX9-O0-NEXT:    v_mov_b32_e32 v26, s26
+; GFX9-O0-NEXT:    v_mov_b32_e32 v27, s27
+; GFX9-O0-NEXT:    v_mov_b32_e32 v28, s28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v29, s29
+; GFX9-O0-NEXT:    v_readlane_b32 s71, v32, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s70, v32, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s69, v32, 5
+; GFX9-O0-NEXT:    v_readlane_b32 s68, v32, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s67, v32, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s66, v32, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s65, v32, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s64, v32, 0
+; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_mov_b64 exec, -1
-; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -1265,30 +1308,26 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_load_dword v26, off, s[0:3], s32
-; GFX9-O3-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:4
-; GFX9-O3-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:8
-; GFX9-O3-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:12
-; GFX9-O3-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:16
-; GFX9-O3-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_load_dword v30, off, s[0:3], s32
+; GFX9-O3-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v33, v2
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
@@ -1329,46 +1368,50 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:24
 ; GFX9-O3-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen offset:36
 ; GFX9-O3-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen offset:32
-; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
-; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-O3-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-O3-NEXT:    v_mov_b32_e32 v11, s15
-; GFX9-O3-NEXT:    v_mov_b32_e32 v12, s16
-; GFX9-O3-NEXT:    v_mov_b32_e32 v13, s17
-; GFX9-O3-NEXT:    v_mov_b32_e32 v14, s18
-; GFX9-O3-NEXT:    v_mov_b32_e32 v15, s19
-; GFX9-O3-NEXT:    v_mov_b32_e32 v16, s20
-; GFX9-O3-NEXT:    v_mov_b32_e32 v17, s21
-; GFX9-O3-NEXT:    v_mov_b32_e32 v18, s22
-; GFX9-O3-NEXT:    v_mov_b32_e32 v19, s23
-; GFX9-O3-NEXT:    v_mov_b32_e32 v20, s24
-; GFX9-O3-NEXT:    v_mov_b32_e32 v21, s25
-; GFX9-O3-NEXT:    v_mov_b32_e32 v22, s26
-; GFX9-O3-NEXT:    v_mov_b32_e32 v23, s27
-; GFX9-O3-NEXT:    v_mov_b32_e32 v24, s28
-; GFX9-O3-NEXT:    v_mov_b32_e32 v25, s29
+; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s5
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s8
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v10, s10
+; GFX9-O3-NEXT:    v_mov_b32_e32 v11, s11
+; GFX9-O3-NEXT:    v_mov_b32_e32 v12, s12
+; GFX9-O3-NEXT:    v_mov_b32_e32 v13, s13
+; GFX9-O3-NEXT:    v_mov_b32_e32 v14, s14
+; GFX9-O3-NEXT:    v_mov_b32_e32 v15, s15
+; GFX9-O3-NEXT:    v_mov_b32_e32 v16, s16
+; GFX9-O3-NEXT:    v_mov_b32_e32 v17, s17
+; GFX9-O3-NEXT:    v_mov_b32_e32 v18, s18
+; GFX9-O3-NEXT:    v_mov_b32_e32 v19, s19
+; GFX9-O3-NEXT:    v_mov_b32_e32 v20, s20
+; GFX9-O3-NEXT:    v_mov_b32_e32 v21, s21
+; GFX9-O3-NEXT:    v_mov_b32_e32 v22, s22
+; GFX9-O3-NEXT:    v_mov_b32_e32 v23, s23
+; GFX9-O3-NEXT:    v_mov_b32_e32 v24, s24
+; GFX9-O3-NEXT:    v_mov_b32_e32 v25, s25
+; GFX9-O3-NEXT:    v_mov_b32_e32 v26, s26
+; GFX9-O3-NEXT:    v_mov_b32_e32 v27, s27
+; GFX9-O3-NEXT:    v_mov_b32_e32 v28, s28
+; GFX9-O3-NEXT:    v_mov_b32_e32 v29, s29
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]

>From af4f7bb62362fbc3c7a7d4325b75d320ab5635bf Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001 at gmail.com>
Date: Wed, 20 Mar 2024 21:25:37 +0530
Subject: [PATCH 3/3] Fixed issue and updated relevant tests

---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |   16 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   15 +-
 .../GlobalISel/irtranslator-call-non-fixed.ll |   10 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |   10 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |    2 +-
 .../CodeGen/AMDGPU/combine_andor_with_cmps.ll |   24 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll     |  133 +
 .../AMDGPU/gfx-callable-argument-types.ll     | 4371 +++++++----------
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     |   60 +-
 .../CodeGen/AMDGPU/schedule-addrspaces.ll     |    2 +-
 .../CodeGen/AMDGPU/scratch-pointer-sink.ll    |    4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |  761 ++-
 12 files changed, 2436 insertions(+), 2972 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 4087935cc3ad32..98c1d85c68a495 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -717,6 +717,13 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
   }
 
+  if (!IsEntryFunc) {
+    if (!Subtarget.enableFlatScratch())
+      CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    if (!IsGraphics)
+      TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+  }
+
   IncomingValueAssigner Assigner(AssignFn);
   if (!determineAssignments(Assigner, SplitArgs, CCInfo))
     return false;
@@ -728,14 +735,9 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   uint64_t StackSize = Assigner.StackSize;
 
   // Start adding system SGPRs.
-  if (IsEntryFunc) {
+  if (IsEntryFunc)
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
-  } else {
-    if (!Subtarget.enableFlatScratch())
-      CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    if (!IsGraphics)
-      TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
-  }
+
   // When we tail call, we need to check if the callee's arguments will fit on
   // the caller's stack. So, whenever we lower formal arguments, we should keep
   // track of this information, since we might lower a tail call in this
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7e15f10bf00a80..d47bcf5d2262e6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2839,6 +2839,13 @@ SDValue SITargetLowering::LowerFormalArguments(
     allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
   }
 
+  if (!IsEntryFunc) {
+    if (!Subtarget->enableFlatScratch())
+      CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    if (!IsGraphics)
+      allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+  }
+
   if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
@@ -3040,14 +3047,8 @@ SDValue SITargetLowering::LowerFormalArguments(
   }
 
   // Start adding system SGPRs.
-  if (IsEntryFunc) {
+  if (IsEntryFunc)
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
-  } else {
-    if (!Subtarget->enableFlatScratch())
-      CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    if (!IsGraphics)
-      allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
-  }
 
   auto &ArgUsageInfo =
     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
index 5effd24a752088..fad833c0a6ad5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
-  ; CHECK-NEXT:   $sgpr4 = COPY [[C]](s32)
+  ; CHECK-NEXT:   $sgpr0 = COPY [[C]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -99,11 +99,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
-  ; CHECK-NEXT:   $sgpr4 = COPY [[ANYEXT1]](s32)
-  ; CHECK-NEXT:   $sgpr5 = COPY [[LOAD2]](s32)
+  ; CHECK-NEXT:   $sgpr0 = COPY [[ANYEXT1]](s32)
+  ; CHECK-NEXT:   $sgpr1 = COPY [[LOAD2]](s32)
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index cb0efc19169dc2..bc3c84fd2359b8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
-  ; CHECK-NEXT:   $sgpr4 = COPY [[C]](s32)
+  ; CHECK-NEXT:   $sgpr0 = COPY [[C]](s32)
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
   ; CHECK-NEXT:   [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
-  ; CHECK-NEXT:   $sgpr4 = COPY [[ANYEXT1]](s32)
-  ; CHECK-NEXT:   $sgpr5 = COPY [[LOAD2]](s32)
+  ; CHECK-NEXT:   $sgpr0 = COPY [[ANYEXT1]](s32)
+  ; CHECK-NEXT:   $sgpr1 = COPY [[LOAD2]](s32)
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
-  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+  ; CHECK-NEXT:   $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   SI_RETURN
   %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ebb77c13c4af7b..4d7c04b82ba9d1 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
 ; GFX11-LABEL: test_inreg_arg_store:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %in, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 10d71a315fbf98..e1e3220cc27554 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test34:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_min_i32 s0, s4, s5
+; GCN-NEXT:    s_min_i32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_lt_i32 s0, 0x3e9
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test35:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_max_i32 s0, s4, s5
+; GCN-NEXT:    s_max_i32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e8
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-LABEL: test36:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_min_u32 s0, s4, s5
+; GCN-NEXT:    s_min_u32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_cmp_lt_u32 s0, s6
+; GCN-NEXT:    s_cmp_lt_u32 s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
@@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-LABEL: test37:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_max_i32 s0, s4, s5
+; GCN-NEXT:    s_max_i32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_cmp_ge_i32 s0, s6
+; GCN-NEXT:    s_cmp_ge_i32 s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
@@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test38:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_max_u32 s0, s4, s5
+; GCN-NEXT:    s_max_u32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_lt_u32 s0, 0x3e9
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test39:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_min_i32 s0, s4, s5
+; GCN-NEXT:    s_min_i32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e7
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-LABEL: test40:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_max_i32 s0, s4, s5
+; GCN-NEXT:    s_max_i32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_cmp_le_i32 s0, s6
+; GCN-NEXT:    s_cmp_le_i32 s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
@@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-LABEL: test41:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_min_u32 s0, s4, s5
+; GCN-NEXT:    s_min_u32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_cmp_ge_u32 s0, s6
+; GCN-NEXT:    s_cmp_ge_u32 s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_cselect_b32 s0, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 44a9127b4bd09c..27845b6b5b2fee 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -2176,6 +2176,93 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr)
 declare void @extern()
 
 define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %ptr) {
+; GFX9-LABEL: void_func_a13i32_inreg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s27, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[28:29], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[28:29]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s26
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off offset:48
+; GFX9-NEXT:    v_mov_b32_e32 v5, s25
+; GFX9-NEXT:    v_mov_b32_e32 v4, s24
+; GFX9-NEXT:    v_mov_b32_e32 v3, s23
+; GFX9-NEXT:    v_mov_b32_e32 v2, s22
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:32
+; GFX9-NEXT:    v_writelane_b32 v40, s27, 2
+; GFX9-NEXT:    v_mov_b32_e32 v5, s21
+; GFX9-NEXT:    v_mov_b32_e32 v4, s20
+; GFX9-NEXT:    v_mov_b32_e32 v3, s19
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s17
+; GFX9-NEXT:    v_mov_b32_e32 v4, s16
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, extern at gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, extern at gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 2
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a13i32_inreg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s23, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s24, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s24
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17
+; GFX11-NEXT:    s_getpc_b64 s[18:19]
+; GFX11-NEXT:    s_add_u32 s18, s18, extern at gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s19, s19, extern at gotpcrel32@hi+12
+; GFX11-NEXT:    v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT:    s_load_b64 s[16:17], s[18:19], 0x0
+; GFX11-NEXT:    v_writelane_b32 v40, s23, 2
+; GFX11-NEXT:    v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21
+; GFX11-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
+; GFX11-NEXT:    v_mov_b32_e32 v10, s0
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b32 v[0:1], v14, off offset:48
+; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off offset:32
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store [13 x i32] %arg0, ptr addrspace(1) %ptr
   call void @extern()
   ret void
@@ -2203,6 +2290,52 @@ define void @void_func_a13i32_inreg([13  x i32] inreg %arg0, ptr addrspace(1) %p
 
 ; FIXME: Should still fail
 define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
+; GFX9-LABEL: void_func_a16i32_inreg__noimplicit:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, s19
+; GFX9-NEXT:    v_mov_b32_e32 v4, s18
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:48
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    v_mov_b32_e32 v4, s14
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s12
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    v_mov_b32_e32 v4, s10
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s7
+; GFX9-NEXT:    v_mov_b32_e32 v4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a16i32_inreg__noimplicit:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14
+; GFX11-NEXT:    v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
+; GFX11-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10
+; GFX11-NEXT:    v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8
+; GFX11-NEXT:    v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX11-NEXT:    v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX11-NEXT:    v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2
+; GFX11-NEXT:    v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off offset:48
+; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off offset:32
+; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off offset:16
+; GFX11-NEXT:    global_store_b128 v[0:1], v[14:17], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store [16 x i32] %arg0, ptr addrspace(1) %ptr
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index a118fa388f86d9..3e1db5fb4e1dc8 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -9567,19 +9567,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i8_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i8_inreg at abs32@lo
-; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9597,19 +9595,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i8_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i8_inreg at abs32@lo
+; GFX10-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9627,20 +9623,18 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_i8_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_i8_inreg at abs32@lo
+; GFX11-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x7b
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -9658,19 +9652,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i8_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i8_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_i8_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_i8_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9692,19 +9684,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i16_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i16_inreg at abs32@lo
-; GFX9-NEXT:    s_movk_i32 s4, 0x7b
+; GFX9-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9722,19 +9712,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i16_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i16_inreg at abs32@lo
+; GFX10-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9752,20 +9740,18 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_i16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_i16_inreg at abs32@lo
+; GFX11-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x7b
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -9783,19 +9769,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s0, 0x7b
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9817,19 +9801,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 42
+; GFX9-NEXT:    s_mov_b32 s0, 42
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9847,19 +9829,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 42
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 42
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9877,20 +9857,18 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_i32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 42
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 42
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -9908,19 +9886,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 42
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 42
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9942,22 +9918,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i64_inreg at abs32@lo
-; GFX9-NEXT:    s_movk_i32 s4, 0x7b
-; GFX9-NEXT:    s_mov_b32 s5, 0
+; GFX9-NEXT:    s_movk_i32 s0, 0x7b
+; GFX9-NEXT:    s_mov_b32 s1, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9975,22 +9947,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i64_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i64_inreg at abs32@lo
+; GFX10-NEXT:    s_movk_i32 s0, 0x7b
+; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10008,23 +9976,19 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i64_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_i64_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_i64_inreg at abs32@lo
+; GFX11-NEXT:    s_movk_i32 s0, 0x7b
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x7b
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10042,22 +10006,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s0, 0x7b
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10079,26 +10039,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10116,26 +10073,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10153,27 +10107,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10191,26 +10137,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10233,28 +10171,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1
-; GFX9-NEXT:    s_mov_b32 s5, 2
-; GFX9-NEXT:    s_mov_b32 s6, 3
-; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    s_mov_b32 s0, 1
+; GFX9-NEXT:    s_mov_b32 s1, 2
+; GFX9-NEXT:    s_mov_b32 s2, 3
+; GFX9-NEXT:    s_mov_b32 s3, 4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10272,28 +10202,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 1
+; GFX10-NEXT:    s_mov_b32 s1, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s2, 3
+; GFX10-NEXT:    s_mov_b32 s3, 4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10311,29 +10233,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1
+; GFX11-NEXT:    s_mov_b32 s1, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s2, 3
+; GFX11-NEXT:    s_mov_b32 s3, 4
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10351,28 +10265,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 4
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10394,32 +10300,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX9-NEXT:    s_mov_b64 s[34:35], 0
+; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    s_mov_b64 s[34:35], 0
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i64_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s8, 1
-; GFX9-NEXT:    s_mov_b32 s9, 2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
+; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    s_mov_b32 s5, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10437,32 +10340,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i64_inreg at abs32@hi
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i64_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-NEXT:    s_mov_b32 s8, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-NEXT:    s_mov_b32 s9, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10480,33 +10380,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v3i64_inreg at abs32@hi
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v3i64_inreg at abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i64_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX11-NEXT:    s_mov_b32 s8, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX11-NEXT:    s_mov_b32 s9, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s5, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10524,32 +10416,24 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v3i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v3i64_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i64_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10574,38 +10458,35 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX9-NEXT:    s_mov_b64 s[34:35], 0
+; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_mov_b64 s[34:35], 0
-; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s8, 1
-; GFX9-NEXT:    s_mov_b32 s9, 2
-; GFX9-NEXT:    s_mov_b32 s10, 3
-; GFX9-NEXT:    s_mov_b32 s11, 4
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
+; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    s_mov_b32 s6, 3
+; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10623,38 +10504,35 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX10-NEXT:    s_mov_b64 s[34:35], 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    s_mov_b32 s6, 3
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-NEXT:    s_mov_b32 s8, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-NEXT:    s_mov_b32 s9, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX10-NEXT:    s_mov_b32 s10, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX10-NEXT:    s_mov_b32 s11, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX10-NEXT:    s_mov_b32 s7, 4
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10672,39 +10550,31 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s4, 1
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX11-NEXT:    s_mov_b32 s5, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX11-NEXT:    s_mov_b32 s6, 3
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i64_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX11-NEXT:    s_mov_b32 s8, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX11-NEXT:    s_mov_b32 s9, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX11-NEXT:    s_mov_b32 s10, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX11-NEXT:    s_mov_b32 s11, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s7, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10722,38 +10592,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
 ; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v4i64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v4i64_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i64_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s10, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10777,19 +10639,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f16_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f16_inreg at abs32@lo
-; GFX9-NEXT:    s_movk_i32 s4, 0x4400
+; GFX9-NEXT:    s_movk_i32 s0, 0x4400
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10807,19 +10667,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f16_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f16_inreg at abs32@lo
+; GFX10-NEXT:    s_movk_i32 s0, 0x4400
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_movk_i32 s4, 0x4400
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10837,20 +10695,18 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f16_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_f16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_f16_inreg at abs32@lo
+; GFX11-NEXT:    s_movk_i32 s0, 0x4400
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_movk_i32 s4, 0x4400
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10868,19 +10724,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_f16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_f16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s0, 0x4400
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x4400
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10902,19 +10756,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 4.0
+; GFX9-NEXT:    s_mov_b32 s0, 4.0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10932,19 +10784,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 4.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 4.0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10962,20 +10812,18 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_f32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_f32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 4.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 4.0
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10993,19 +10841,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 4.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 4.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11027,22 +10873,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1.0
-; GFX9-NEXT:    s_mov_b32 s5, 2.0
+; GFX9-NEXT:    s_mov_b32 s0, 1.0
+; GFX9-NEXT:    s_mov_b32 s1, 2.0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11060,22 +10902,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 1.0
+; GFX10-NEXT:    s_mov_b32 s1, 2.0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11093,23 +10931,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v2f32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v2f32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1.0
+; GFX11-NEXT:    s_mov_b32 s1, 2.0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1.0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2.0
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11127,22 +10961,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v2f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v2f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1.0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11164,25 +10994,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1.0
-; GFX9-NEXT:    s_mov_b32 s5, 2.0
-; GFX9-NEXT:    s_mov_b32 s6, 4.0
+; GFX9-NEXT:    s_mov_b32 s0, 1.0
+; GFX9-NEXT:    s_mov_b32 s1, 2.0
+; GFX9-NEXT:    s_mov_b32 s2, 4.0
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11200,25 +11024,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 1.0
+; GFX10-NEXT:    s_mov_b32 s1, 2.0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s2, 4.0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 4.0
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11236,26 +11054,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 5
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v3f32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v3f32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1.0
+; GFX11-NEXT:    s_mov_b32 s1, 2.0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s2, 4.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1.0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2.0
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 4.0
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11273,25 +11085,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 5
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v3f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v3f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1.0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 4.0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11313,31 +11119,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5f32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5f32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1.0
-; GFX9-NEXT:    s_mov_b32 s5, 2.0
-; GFX9-NEXT:    s_mov_b32 s6, 4.0
-; GFX9-NEXT:    s_mov_b32 s7, -1.0
-; GFX9-NEXT:    s_mov_b32 s8, 0.5
+; GFX9-NEXT:    s_mov_b32 s0, 1.0
+; GFX9-NEXT:    s_mov_b32 s1, 2.0
+; GFX9-NEXT:    s_mov_b32 s2, 4.0
+; GFX9-NEXT:    s_mov_b32 s3, -1.0
+; GFX9-NEXT:    s_mov_b32 s4, 0.5
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11355,31 +11153,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 7
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5f32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5f32_inreg at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_mov_b32 s0, 1.0
+; GFX10-NEXT:    s_mov_b32 s1, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 4.0
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, -1.0
-; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-NEXT:    s_mov_b32 s8, 0.5
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX10-NEXT:    s_mov_b32 s2, 4.0
+; GFX10-NEXT:    s_mov_b32 s3, -1.0
+; GFX10-NEXT:    s_mov_b32 s4, 0.5
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11397,32 +11187,24 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 7
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5f32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5f32_inreg at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v5f32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v5f32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1.0
+; GFX11-NEXT:    s_mov_b32 s1, 2.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1.0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2.0
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 4.0
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, -1.0
-; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX11-NEXT:    s_mov_b32 s8, 0.5
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s2, 4.0
+; GFX11-NEXT:    s_mov_b32 s3, -1.0
+; GFX11-NEXT:    s_mov_b32 s4, 0.5
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11440,31 +11222,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 7
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5f32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5f32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v5f32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v5f32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1.0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, -1.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 0.5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 4.0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, -1.0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0.5
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11486,22 +11260,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f64_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0x40100000
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40100000
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11519,22 +11289,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f64_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f64_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    s_mov_b32 s1, 0x40100000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 0x40100000
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11552,23 +11318,19 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f64_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_f64_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_f64_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 0x40100000
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 0x40100000
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11586,22 +11348,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_f64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 0x40100000
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40100000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11623,28 +11381,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f64_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 2.0
-; GFX9-NEXT:    s_mov_b32 s6, 0
-; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX9-NEXT:    s_mov_b32 s2, 0
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 2.0
+; GFX9-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11662,28 +11412,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f64_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f64_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s1, 2.0
+; GFX10-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11701,29 +11443,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f64_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v2f64_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v2f64_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 2.0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2.0
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11741,28 +11475,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v2f64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v2f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2.0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11784,34 +11510,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f64_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f64_inreg at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_mov_b32 s1, 2.0
+; GFX9-NEXT:    s_mov_b32 s2, 0
+; GFX9-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 2.0
-; GFX9-NEXT:    s_mov_b32 s6, 0
-; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, 0x40200000
+; GFX9-NEXT:    s_mov_b32 s5, 0x40200000
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11829,34 +11547,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f64_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f64_inreg at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    s_mov_b32 s1, 2.0
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-NEXT:    s_mov_b32 s8, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-NEXT:    s_mov_b32 s9, 0x40200000
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
+; GFX10-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 8
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11874,35 +11584,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 8
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f64_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f64_inreg at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v3f64_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v3f64_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s1, 2.0
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2.0
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX11-NEXT:    s_mov_b32 s8, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX11-NEXT:    s_mov_b32 s9, 0x40200000
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11920,34 +11622,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 8
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f64_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f64_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v3f64_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v3f64_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2.0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 0x40100000
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 0x40100000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 0x40200000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40200000
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 8
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11969,19 +11663,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_load_dword s0, s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i16_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i16_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11999,19 +11691,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[34:35], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i16_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i16_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12029,20 +11719,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v2i16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v2i16_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i16_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12060,19 +11748,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v2i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v2i16_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12095,21 +11781,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
-; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s37, external_void_func_v3i16_inreg at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s36, external_void_func_v3i16_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s1, s35
+; GFX9-NEXT:    s_mov_b32 s0, s34
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12127,21 +11812,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s37, external_void_func_v3i16_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s36, external_void_func_v3i16_inreg at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s1, s35
+; GFX10-NEXT:    s_mov_b32 s0, s34
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12159,22 +11843,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v3i16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v3i16_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12192,21 +11872,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v3i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v3i16_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12229,21 +11905,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
-; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s37, external_void_func_v3f16_inreg at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s36, external_void_func_v3f16_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s1, s35
+; GFX9-NEXT:    s_mov_b32 s0, s34
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12261,21 +11936,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s37, external_void_func_v3f16_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s36, external_void_func_v3f16_inreg at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s1, s35
+; GFX10-NEXT:    s_mov_b32 s0, s34
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12293,22 +11967,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v3f16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v3f16_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12326,21 +11996,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v3f16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v3f16_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12363,22 +12029,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 0x20001
-; GFX9-NEXT:    s_mov_b32 s5, 3
+; GFX9-NEXT:    s_mov_b32 s0, 0x20001
+; GFX9-NEXT:    s_mov_b32 s1, 3
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12396,22 +12058,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 0x20001
+; GFX10-NEXT:    s_mov_b32 s1, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12429,23 +12087,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v3i16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v3i16_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 0x20001
+; GFX11-NEXT:    s_mov_b32 s1, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0x20001
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12463,22 +12117,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v3i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v3i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 0x20001
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12500,22 +12150,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
-; GFX9-NEXT:    s_movk_i32 s5, 0x4400
+; GFX9-NEXT:    s_mov_b32 s0, 0x40003c00
+; GFX9-NEXT:    s_movk_i32 s1, 0x4400
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12533,22 +12179,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 0x40003c00
+; GFX10-NEXT:    s_movk_i32 s1, 0x4400
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_movk_i32 s5, 0x4400
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12566,23 +12208,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v3f16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v3f16_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 0x40003c00
+; GFX11-NEXT:    s_movk_i32 s1, 0x4400
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0x40003c00
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_movk_i32 s5, 0x4400
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12600,22 +12238,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v3f16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v3f16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 0x40003c00
+; GFX10-SCRATCH-NEXT:    s_movk_i32 s1, 0x4400
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x40003c00
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_movk_i32 s5, 0x4400
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12637,21 +12271,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
-; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s37, external_void_func_v4i16_inreg at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s36, external_void_func_v4i16_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s34
+; GFX9-NEXT:    s_mov_b32 s1, s35
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12669,21 +12302,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s37, external_void_func_v4i16_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s36, external_void_func_v4i16_inreg at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s34
+; GFX10-NEXT:    s_mov_b32 s1, s35
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12701,22 +12333,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v4i16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v4i16_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12734,21 +12362,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v4i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v4i16_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12771,22 +12395,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 0x20001
-; GFX9-NEXT:    s_mov_b32 s5, 0x40003
+; GFX9-NEXT:    s_mov_b32 s0, 0x20001
+; GFX9-NEXT:    s_mov_b32 s1, 0x40003
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12804,22 +12424,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 0x20001
+; GFX10-NEXT:    s_mov_b32 s1, 0x40003
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 0x40003
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12837,23 +12453,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v4i16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v4i16_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 0x20001
+; GFX11-NEXT:    s_mov_b32 s1, 0x40003
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0x20001
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 0x40003
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12871,22 +12483,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v4i16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v4i16_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 0x20001
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 0x40003
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40003
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12908,19 +12516,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX9-NEXT:    s_load_dword s0, s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f16_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f16_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12938,19 +12544,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT:    s_load_dword s0, s[34:35], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f16_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f16_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12968,20 +12572,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v2f16_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v2f16_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f16_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f16_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12999,19 +12601,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v2f16_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v2f16_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f16_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f16_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13034,21 +12634,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
-; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    s_mov_b32 s37, external_void_func_v2i32_inreg at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s36, external_void_func_v2i32_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s34
+; GFX9-NEXT:    s_mov_b32 s1, s35
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13066,21 +12665,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s37, external_void_func_v2i32_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s36, external_void_func_v2i32_inreg at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s34
+; GFX10-NEXT:    s_mov_b32 s1, s35
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13098,22 +12696,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v2i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v2i32_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13131,21 +12725,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v2i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v2i32_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13168,22 +12758,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1
-; GFX9-NEXT:    s_mov_b32 s5, 2
+; GFX9-NEXT:    s_mov_b32 s0, 1
+; GFX9-NEXT:    s_mov_b32 s1, 2
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13201,22 +12787,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 1
+; GFX10-NEXT:    s_mov_b32 s1, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13234,23 +12816,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v2i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v2i32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1
+; GFX11-NEXT:    s_mov_b32 s1, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13268,22 +12846,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v2i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v2i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13305,25 +12879,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 3
-; GFX9-NEXT:    s_mov_b32 s5, 4
-; GFX9-NEXT:    s_mov_b32 s6, 5
+; GFX9-NEXT:    s_mov_b32 s0, 3
+; GFX9-NEXT:    s_mov_b32 s1, 4
+; GFX9-NEXT:    s_mov_b32 s2, 5
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13341,25 +12909,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 3
+; GFX10-NEXT:    s_mov_b32 s1, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s2, 5
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13377,26 +12939,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 5
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v3i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v3i32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 3
+; GFX11-NEXT:    s_mov_b32 s1, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s2, 5
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13414,25 +12970,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 5
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v3i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v3i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 5
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 5
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13454,28 +13004,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 3
-; GFX9-NEXT:    s_mov_b32 s5, 4
-; GFX9-NEXT:    s_mov_b32 s6, 5
-; GFX9-NEXT:    s_mov_b32 s7, 6
+; GFX9-NEXT:    s_mov_b32 s0, 3
+; GFX9-NEXT:    s_mov_b32 s1, 4
+; GFX9-NEXT:    s_mov_b32 s2, 5
+; GFX9-NEXT:    s_mov_b32 s3, 6
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13493,28 +13035,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 3
+; GFX10-NEXT:    s_mov_b32 s1, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s2, 5
+; GFX10-NEXT:    s_mov_b32 s3, 6
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13532,29 +13066,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i32_i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 3
+; GFX11-NEXT:    s_mov_b32 s1, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s2, 5
+; GFX11-NEXT:    s_mov_b32 s3, 6
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13572,28 +13098,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i32_i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 5
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 6
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13615,25 +13133,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13651,25 +13166,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13687,26 +13199,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13724,25 +13228,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13765,28 +13261,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1
-; GFX9-NEXT:    s_mov_b32 s5, 2
-; GFX9-NEXT:    s_mov_b32 s6, 3
-; GFX9-NEXT:    s_mov_b32 s7, 4
+; GFX9-NEXT:    s_mov_b32 s0, 1
+; GFX9-NEXT:    s_mov_b32 s1, 2
+; GFX9-NEXT:    s_mov_b32 s2, 3
+; GFX9-NEXT:    s_mov_b32 s3, 4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13804,28 +13292,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 1
+; GFX10-NEXT:    s_mov_b32 s1, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s2, 3
+; GFX10-NEXT:    s_mov_b32 s3, 4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13843,29 +13323,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1
+; GFX11-NEXT:    s_mov_b32 s1, 2
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_mov_b32 s2, 3
+; GFX11-NEXT:    s_mov_b32 s3, 4
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13883,28 +13355,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 4
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13926,31 +13390,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 5
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1
-; GFX9-NEXT:    s_mov_b32 s5, 2
-; GFX9-NEXT:    s_mov_b32 s6, 3
-; GFX9-NEXT:    s_mov_b32 s7, 4
-; GFX9-NEXT:    s_mov_b32 s8, 5
+; GFX9-NEXT:    s_mov_b32 s0, 1
+; GFX9-NEXT:    s_mov_b32 s1, 2
+; GFX9-NEXT:    s_mov_b32 s2, 3
+; GFX9-NEXT:    s_mov_b32 s3, 4
+; GFX9-NEXT:    s_mov_b32 s4, 5
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13965,34 +13421,26 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 7
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
-; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2
-; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-NEXT:    s_mov_b32 s8, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, 1
+; GFX10-NEXT:    s_mov_b32 s1, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s2, 3
+; GFX10-NEXT:    s_mov_b32 s3, 4
+; GFX10-NEXT:    s_mov_b32 s4, 5
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 7
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14010,32 +13458,24 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 7
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i32_inreg at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1
+; GFX11-NEXT:    s_mov_b32 s1, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
-; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2
-; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX11-NEXT:    s_mov_b32 s8, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s2, 3
+; GFX11-NEXT:    s_mov_b32 s3, 4
+; GFX11-NEXT:    s_mov_b32 s4, 5
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14053,31 +13493,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 7
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v5i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v5i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 5
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 7
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14099,35 +13531,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX9-NEXT:    s_load_dwordx8 s[36:43], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
+; GFX9-NEXT:    s_mov_b32 s4, s40
+; GFX9-NEXT:    s_mov_b32 s5, s41
+; GFX9-NEXT:    s_mov_b32 s6, s42
+; GFX9-NEXT:    s_mov_b32 s7, s43
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -14145,35 +13578,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[34:35], 0x0
+; GFX10-NEXT:    s_load_dwordx8 s[36:43], s[34:35], 0x0
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
+; GFX10-NEXT:    s_mov_b32 s4, s40
+; GFX10-NEXT:    s_mov_b32 s5, s41
+; GFX10-NEXT:    s_mov_b32 s6, s42
+; GFX10-NEXT:    s_mov_b32 s7, s43
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14191,36 +13625,28 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14238,35 +13664,27 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14290,40 +13708,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s4, 1
-; GFX9-NEXT:    s_mov_b32 s5, 2
-; GFX9-NEXT:    s_mov_b32 s6, 3
-; GFX9-NEXT:    s_mov_b32 s7, 4
-; GFX9-NEXT:    s_mov_b32 s8, 5
-; GFX9-NEXT:    s_mov_b32 s9, 6
-; GFX9-NEXT:    s_mov_b32 s10, 7
-; GFX9-NEXT:    s_mov_b32 s11, 8
+; GFX9-NEXT:    s_mov_b32 s0, 1
+; GFX9-NEXT:    s_mov_b32 s1, 2
+; GFX9-NEXT:    s_mov_b32 s2, 3
+; GFX9-NEXT:    s_mov_b32 s3, 4
+; GFX9-NEXT:    s_mov_b32 s4, 5
+; GFX9-NEXT:    s_mov_b32 s5, 6
+; GFX9-NEXT:    s_mov_b32 s6, 7
+; GFX9-NEXT:    s_mov_b32 s7, 8
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -14341,40 +13751,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_mov_b32 s0, 1
+; GFX10-NEXT:    s_mov_b32 s1, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s2, 3
+; GFX10-NEXT:    s_mov_b32 s3, 4
+; GFX10-NEXT:    s_mov_b32 s4, 5
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-NEXT:    s_mov_b32 s5, 2
+; GFX10-NEXT:    s_mov_b32 s5, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-NEXT:    s_mov_b32 s6, 3
+; GFX10-NEXT:    s_mov_b32 s6, 7
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-NEXT:    s_mov_b32 s7, 4
-; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-NEXT:    s_mov_b32 s8, 5
-; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-NEXT:    s_mov_b32 s9, 6
-; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX10-NEXT:    s_mov_b32 s10, 7
-; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX10-NEXT:    s_mov_b32 s11, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
+; GFX10-NEXT:    s_mov_b32 s7, 8
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14392,41 +13794,33 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s0, 1
+; GFX11-NEXT:    s_mov_b32 s1, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX11-NEXT:    s_mov_b32 s4, 1
+; GFX11-NEXT:    s_mov_b32 s2, 3
+; GFX11-NEXT:    s_mov_b32 s3, 4
+; GFX11-NEXT:    s_mov_b32 s4, 5
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX11-NEXT:    s_mov_b32 s5, 2
+; GFX11-NEXT:    s_mov_b32 s5, 6
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX11-NEXT:    s_mov_b32 s6, 3
+; GFX11-NEXT:    s_mov_b32 s6, 7
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX11-NEXT:    s_mov_b32 s7, 4
-; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX11-NEXT:    s_mov_b32 s8, 5
-; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX11-NEXT:    s_mov_b32 s9, 6
-; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX11-NEXT:    s_mov_b32 s10, 7
-; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX11-NEXT:    s_mov_b32 s11, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s7, 8
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14444,40 +13838,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 5
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 6
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 7
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 5
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 6
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s10, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 8
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14499,38 +13885,47 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 18
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 14
 ; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
 ; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
-; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
-; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
-; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
-; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 12
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16i32_inreg at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16i32_inreg at abs32@lo
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
+; GFX9-NEXT:    s_mov_b32 s4, s40
+; GFX9-NEXT:    s_mov_b32 s5, s41
+; GFX9-NEXT:    s_mov_b32 s6, s42
+; GFX9-NEXT:    s_mov_b32 s7, s43
+; GFX9-NEXT:    s_mov_b32 s8, s44
+; GFX9-NEXT:    s_mov_b32 s9, s45
+; GFX9-NEXT:    s_mov_b32 s10, s46
+; GFX9-NEXT:    s_mov_b32 s11, s47
+; GFX9-NEXT:    s_mov_b32 s12, s48
+; GFX9-NEXT:    s_mov_b32 s13, s49
+; GFX9-NEXT:    s_mov_b32 s14, s50
+; GFX9-NEXT:    s_mov_b32 s15, s51
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 13
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
-; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
-; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
-; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
-; GFX9-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 13
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 12
 ; GFX9-NEXT:    v_readlane_b32 s15, v40, 11
 ; GFX9-NEXT:    v_readlane_b32 s14, v40, 10
 ; GFX9-NEXT:    v_readlane_b32 s13, v40, 9
@@ -14543,7 +13938,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 18
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 14
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -14561,38 +13956,47 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 18
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 14
 ; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16i32_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i32_inreg at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
 ; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
 ; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
+; GFX10-NEXT:    s_mov_b32 s4, s40
+; GFX10-NEXT:    s_mov_b32 s5, s41
+; GFX10-NEXT:    s_mov_b32 s6, s42
+; GFX10-NEXT:    s_mov_b32 s7, s43
 ; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
+; GFX10-NEXT:    s_mov_b32 s8, s44
+; GFX10-NEXT:    s_mov_b32 s9, s45
+; GFX10-NEXT:    s_mov_b32 s10, s46
+; GFX10-NEXT:    s_mov_b32 s11, s47
 ; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
+; GFX10-NEXT:    s_mov_b32 s12, s48
+; GFX10-NEXT:    s_mov_b32 s13, s49
+; GFX10-NEXT:    s_mov_b32 s14, s50
 ; GFX10-NEXT:    v_writelane_b32 v40, s15, 11
-; GFX10-NEXT:    v_writelane_b32 v40, s16, 12
-; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
-; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
-; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16i32_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i32_inreg at abs32@lo
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
+; GFX10-NEXT:    s_mov_b32 s15, s51
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 12
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 13
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 16
-; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
-; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
-; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
-; GFX10-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 13
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 12
 ; GFX10-NEXT:    v_readlane_b32 s15, v40, 11
 ; GFX10-NEXT:    v_readlane_b32 s14, v40, 10
 ; GFX10-NEXT:    v_readlane_b32 s13, v40, 9
@@ -14605,7 +14009,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 18
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 14
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14623,8 +14027,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 18
+; GFX11-NEXT:    v_writelane_b32 v40, s0, 14
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v16i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v16i32_inreg at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
@@ -14638,24 +14044,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX11-NEXT:    v_writelane_b32 v40, s14, 10
 ; GFX11-NEXT:    v_writelane_b32 v40, s15, 11
-; GFX11-NEXT:    v_writelane_b32 v40, s16, 12
-; GFX11-NEXT:    v_writelane_b32 v40, s17, 13
-; GFX11-NEXT:    v_writelane_b32 v40, s18, 14
-; GFX11-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16i32_inreg at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 17
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 12
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 13
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 16
-; GFX11-NEXT:    v_readlane_b32 s19, v40, 15
-; GFX11-NEXT:    v_readlane_b32 s18, v40, 14
-; GFX11-NEXT:    v_readlane_b32 s17, v40, 13
-; GFX11-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 13
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 12
 ; GFX11-NEXT:    v_readlane_b32 s15, v40, 11
 ; GFX11-NEXT:    v_readlane_b32 s14, v40, 10
 ; GFX11-NEXT:    v_readlane_b32 s13, v40, 9
@@ -14668,7 +14064,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 18
+; GFX11-NEXT:    v_readlane_b32 s0, v40, 14
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14686,8 +14082,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 18
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 14
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v16i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v16i32_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
@@ -14701,23 +14099,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s15, 11
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s16, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16i32_inreg at abs32@lo
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 16
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s16, v40, 12
+; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 13
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 13
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 12
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s15, v40, 11
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s14, v40, 10
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s13, v40, 9
@@ -14730,7 +14118,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 18
+; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 14
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14771,49 +14159,47 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
+; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_mov_b32 s53, external_void_func_v32i32_inreg at abs32@hi
+; GFX9-NEXT:    s_mov_b32 s52, external_void_func_v32i32_inreg at abs32@lo
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s46
-; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX9-NEXT:    v_mov_b32_e32 v1, s47
-; GFX9-NEXT:    v_mov_b32_e32 v2, s48
-; GFX9-NEXT:    v_mov_b32_e32 v3, s49
+; GFX9-NEXT:    v_mov_b32_e32 v0, s30
+; GFX9-NEXT:    v_mov_b32_e32 v1, s31
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s50
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    v_mov_b32_e32 v0, s51
-; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
-; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s20, s36
-; GFX9-NEXT:    s_mov_b32 s21, s37
-; GFX9-NEXT:    s_mov_b32 s22, s38
-; GFX9-NEXT:    s_mov_b32 s23, s39
-; GFX9-NEXT:    s_mov_b32 s24, s40
-; GFX9-NEXT:    s_mov_b32 s25, s41
-; GFX9-NEXT:    s_mov_b32 s26, s42
-; GFX9-NEXT:    s_mov_b32 s27, s43
-; GFX9-NEXT:    s_mov_b32 s28, s44
-; GFX9-NEXT:    s_mov_b32 s29, s45
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
+; GFX9-NEXT:    s_mov_b32 s4, s40
+; GFX9-NEXT:    s_mov_b32 s5, s41
+; GFX9-NEXT:    s_mov_b32 s6, s42
+; GFX9-NEXT:    s_mov_b32 s7, s43
+; GFX9-NEXT:    s_mov_b32 s8, s44
+; GFX9-NEXT:    s_mov_b32 s9, s45
+; GFX9-NEXT:    s_mov_b32 s10, s46
+; GFX9-NEXT:    s_mov_b32 s11, s47
+; GFX9-NEXT:    s_mov_b32 s12, s48
+; GFX9-NEXT:    s_mov_b32 s13, s49
+; GFX9-NEXT:    s_mov_b32 s14, s50
+; GFX9-NEXT:    s_mov_b32 s15, s51
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[52:53]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
@@ -14879,47 +14265,46 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX10-NEXT:    v_mov_b32_e32 v1, s47
-; GFX10-NEXT:    v_mov_b32_e32 v2, s48
-; GFX10-NEXT:    v_mov_b32_e32 v3, s49
-; GFX10-NEXT:    s_mov_b32 s20, s36
 ; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX10-NEXT:    s_mov_b32 s21, s37
-; GFX10-NEXT:    s_mov_b32 s22, s38
-; GFX10-NEXT:    s_mov_b32 s23, s39
-; GFX10-NEXT:    s_mov_b32 s24, s40
 ; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
-; GFX10-NEXT:    s_mov_b32 s25, s41
-; GFX10-NEXT:    v_mov_b32_e32 v4, s50
-; GFX10-NEXT:    v_mov_b32_e32 v5, s51
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
-; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
-; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
-; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
 ; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-NEXT:    s_mov_b32 s26, s42
 ; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX10-NEXT:    s_mov_b32 s27, s43
 ; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX10-NEXT:    s_mov_b32 s28, s44
 ; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX10-NEXT:    s_mov_b32 s29, s45
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s30
+; GFX10-NEXT:    v_mov_b32_e32 v1, s31
+; GFX10-NEXT:    s_mov_b32 s4, s40
+; GFX10-NEXT:    s_mov_b32 s5, s41
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    s_mov_b32 s6, s42
+; GFX10-NEXT:    s_mov_b32 s7, s43
+; GFX10-NEXT:    s_mov_b32 s8, s44
+; GFX10-NEXT:    s_mov_b32 s9, s45
+; GFX10-NEXT:    s_mov_b32 s10, s46
+; GFX10-NEXT:    s_mov_b32 s11, s47
+; GFX10-NEXT:    s_mov_b32 s12, s48
+; GFX10-NEXT:    s_mov_b32 s13, s49
+; GFX10-NEXT:    s_mov_b32 s14, s50
+; GFX10-NEXT:    s_mov_b32 s15, s51
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
@@ -14970,8 +14355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s2, s32, 16
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -14988,42 +14373,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX11-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX11-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b512 s[36:51], s[0:1], 0x40
-; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i32_inreg at abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX11-NEXT:    v_writelane_b32 v40, s22, 18
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51
 ; GFX11-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX11-NEXT:    v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47
-; GFX11-NEXT:    v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
 ; GFX11-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX11-NEXT:    s_mov_b32 s20, s36
-; GFX11-NEXT:    s_mov_b32 s21, s37
-; GFX11-NEXT:    s_mov_b32 s22, s38
-; GFX11-NEXT:    s_mov_b32 s23, s39
 ; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
-; GFX11-NEXT:    s_mov_b32 s24, s40
-; GFX11-NEXT:    s_mov_b32 s25, s41
-; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX11-NEXT:    s_mov_b32 s26, s42
 ; GFX11-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX11-NEXT:    s_mov_b32 s27, s43
 ; GFX11-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX11-NEXT:    s_mov_b32 s28, s44
 ; GFX11-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX11-NEXT:    s_mov_b32 s29, s45
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b512 s[16:31], s[0:1], 0x40
+; GFX11-NEXT:    s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v1, s31
+; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
@@ -15071,9 +14440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -15090,44 +14458,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
-; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s25, 21
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v32i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v32i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s30
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s31
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[0:1], s32
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
@@ -15196,55 +14549,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
 ; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
 ; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s52, s[34:35], 0x0
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
-; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s52
 ; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    v_mov_b32_e32 v0, s46
 ; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX9-NEXT:    v_mov_b32_e32 v1, s47
-; GFX9-NEXT:    v_mov_b32_e32 v2, s48
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
-; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    v_mov_b32_e32 v0, s49
 ; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    v_mov_b32_e32 v0, s50
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s52, s[34:35], 0x0
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    v_mov_b32_e32 v0, s51
-; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
-; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
-; GFX9-NEXT:    s_mov_b32 s20, s36
-; GFX9-NEXT:    s_mov_b32 s21, s37
-; GFX9-NEXT:    s_mov_b32 s22, s38
-; GFX9-NEXT:    s_mov_b32 s23, s39
-; GFX9-NEXT:    s_mov_b32 s24, s40
-; GFX9-NEXT:    s_mov_b32 s25, s41
-; GFX9-NEXT:    s_mov_b32 s26, s42
-; GFX9-NEXT:    s_mov_b32 s27, s43
-; GFX9-NEXT:    s_mov_b32 s28, s44
-; GFX9-NEXT:    s_mov_b32 s29, s45
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s52
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s30
+; GFX9-NEXT:    s_mov_b32 s53, external_void_func_v32i32_i32_inreg at abs32@hi
+; GFX9-NEXT:    v_mov_b32_e32 v1, s31
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_mov_b32 s52, external_void_func_v32i32_i32_inreg at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s0, s36
+; GFX9-NEXT:    s_mov_b32 s1, s37
+; GFX9-NEXT:    s_mov_b32 s2, s38
+; GFX9-NEXT:    s_mov_b32 s3, s39
+; GFX9-NEXT:    s_mov_b32 s4, s40
+; GFX9-NEXT:    s_mov_b32 s5, s41
+; GFX9-NEXT:    s_mov_b32 s6, s42
+; GFX9-NEXT:    s_mov_b32 s7, s43
+; GFX9-NEXT:    s_mov_b32 s8, s44
+; GFX9-NEXT:    s_mov_b32 s9, s45
+; GFX9-NEXT:    s_mov_b32 s10, s46
+; GFX9-NEXT:    s_mov_b32 s11, s47
+; GFX9-NEXT:    s_mov_b32 s12, s48
+; GFX9-NEXT:    s_mov_b32 s13, s49
+; GFX9-NEXT:    s_mov_b32 s14, s50
+; GFX9-NEXT:    s_mov_b32 s15, s51
+; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[52:53]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
 ; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
@@ -15310,52 +14661,51 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
-; GFX10-NEXT:    ; meta instruction
-; GFX10-NEXT:    ; meta instruction
-; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
-; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
 ; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s52
-; GFX10-NEXT:    v_mov_b32_e32 v1, s47
 ; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
-; GFX10-NEXT:    v_mov_b32_e32 v0, s46
-; GFX10-NEXT:    v_mov_b32_e32 v2, s48
-; GFX10-NEXT:    v_mov_b32_e32 v3, s49
 ; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX10-NEXT:    s_mov_b32 s20, s36
-; GFX10-NEXT:    s_mov_b32 s21, s37
-; GFX10-NEXT:    s_mov_b32 s22, s38
-; GFX10-NEXT:    s_mov_b32 s23, s39
 ; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
-; GFX10-NEXT:    s_mov_b32 s24, s40
-; GFX10-NEXT:    s_mov_b32 s25, s41
-; GFX10-NEXT:    v_mov_b32_e32 v4, s50
-; GFX10-NEXT:    v_mov_b32_e32 v5, s51
 ; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-NEXT:    s_mov_b32 s26, s42
-; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
-; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
-; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
-; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
 ; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX10-NEXT:    s_mov_b32 s27, s43
 ; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX10-NEXT:    s_mov_b32 s28, s44
 ; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX10-NEXT:    s_mov_b32 s29, s45
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
+; GFX10-NEXT:    ; meta instruction
+; GFX10-NEXT:    ; meta instruction
+; GFX10-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
+; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s52
+; GFX10-NEXT:    v_mov_b32_e32 v1, s30
+; GFX10-NEXT:    v_mov_b32_e32 v2, s31
+; GFX10-NEXT:    s_mov_b32 s4, s40
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    s_mov_b32 s5, s41
+; GFX10-NEXT:    s_mov_b32 s6, s42
+; GFX10-NEXT:    s_mov_b32 s7, s43
+; GFX10-NEXT:    s_mov_b32 s8, s44
+; GFX10-NEXT:    s_mov_b32 s9, s45
+; GFX10-NEXT:    s_mov_b32 s10, s46
+; GFX10-NEXT:    s_mov_b32 s11, s47
+; GFX10-NEXT:    s_mov_b32 s12, s48
+; GFX10-NEXT:    s_mov_b32 s13, s49
+; GFX10-NEXT:    s_mov_b32 s14, s50
+; GFX10-NEXT:    s_mov_b32 s15, s51
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 s0, s36
+; GFX10-NEXT:    s_mov_b32 s1, s37
+; GFX10-NEXT:    s_mov_b32 s2, s38
+; GFX10-NEXT:    s_mov_b32 s3, s39
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
@@ -15406,8 +14756,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s3, s32, 16
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
+; GFX11-NEXT:    s_add_i32 s36, s32, 8
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -15424,46 +14774,30 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX11-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX11-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
-; GFX11-NEXT:    s_load_b512 s[36:51], s[0:1], 0x40
-; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32_inreg at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i32_i32_inreg at abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX11-NEXT:    v_writelane_b32 v40, s22, 18
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51
 ; GFX11-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX11-NEXT:    v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47
-; GFX11-NEXT:    v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49
 ; GFX11-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX11-NEXT:    v_mov_b32_e32 v2, s48
-; GFX11-NEXT:    s_add_i32 s2, s32, 24
-; GFX11-NEXT:    s_mov_b32 s20, s36
-; GFX11-NEXT:    s_mov_b32 s21, s37
 ; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
-; GFX11-NEXT:    s_mov_b32 s22, s38
-; GFX11-NEXT:    s_mov_b32 s23, s39
-; GFX11-NEXT:    s_mov_b32 s24, s40
-; GFX11-NEXT:    s_mov_b32 s25, s41
 ; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX11-NEXT:    s_mov_b32 s26, s42
-; GFX11-NEXT:    scratch_store_b32 off, v6, s2
-; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX11-NEXT:    s_mov_b32 s27, s43
 ; GFX11-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX11-NEXT:    s_mov_b32 s28, s44
 ; GFX11-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX11-NEXT:    s_mov_b32 s29, s45
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b32 s34, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b512 s[16:31], s[0:1], 0x40
+; GFX11-NEXT:    s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s34 :: v_dual_mov_b32 v1, s31
+; GFX11-NEXT:    v_mov_b32_e32 v0, s30
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
+; GFX11-NEXT:    scratch_store_b32 off, v2, s36
+; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
 ; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
@@ -15511,13 +14845,17 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
-; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT:    s_load_dword s36, s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    s_add_i32 s3, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s36
+; GFX10-SCRATCH-NEXT:    s_add_i32 s36, s32, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
@@ -15530,50 +14868,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_clause 0x2
-; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    ; meta instruction
-; GFX10-SCRATCH-NEXT:    ; meta instruction
-; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
-; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32_inreg at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32_i32_inreg at abs32@lo
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
-; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s24, 20
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s25, 21
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s2
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s3
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s27, 23
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s28, 24
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s29, 25
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
+; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s30
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s31
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s36
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[0:1], s32
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
@@ -17397,6 +16714,7 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -17423,6 +16741,7 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_bf16 at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
@@ -17442,18 +16761,18 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s1, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -17469,19 +16788,19 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s1, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
@@ -17511,6 +16830,7 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -17537,6 +16857,7 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v1bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v1bf16 at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
@@ -17556,18 +16877,18 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX11-LABEL: test_call_external_void_func_v1bf16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s1, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v1bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v1bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -17583,19 +16904,19 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v1bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v1bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s1, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v1bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v1bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
@@ -17625,6 +16946,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -17651,6 +16973,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2bf16 at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
@@ -17670,18 +16993,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s1, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    v_writelane_b32 v40, s1, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v2bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v2bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -17697,19 +17020,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s2, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s1, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v2bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v2bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
@@ -17739,6 +17062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -17765,8 +17090,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3bf16 at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
@@ -17784,18 +17111,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX11-LABEL: test_call_external_void_func_v3bf16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v3bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v3bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -17811,19 +17138,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s3, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s2, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v3bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v3bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
@@ -17853,6 +17180,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -17879,8 +17208,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4bf16 at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
@@ -17898,18 +17229,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX11-LABEL: test_call_external_void_func_v4bf16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s3, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-NEXT:    v_writelane_b32 v40, s2, 2
+; GFX11-NEXT:    s_mov_b32 s3, external_void_func_v4bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s2, external_void_func_v4bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -17925,19 +17256,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s3, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s3
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s2, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, external_void_func_v4bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, external_void_func_v4bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
@@ -17967,6 +17298,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s3, s7
+; GFX9-NEXT:    s_mov_b32 s2, s6
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
@@ -17993,8 +17328,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
+; GFX10-NEXT:    s_mov_b32 s3, s7
+; GFX10-NEXT:    s_mov_b32 s2, s6
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
@@ -18012,18 +17351,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX11-LABEL: test_call_external_void_func_v8bf16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s34, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 exec_lo, s35
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -18039,19 +17378,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v8bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v8bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
@@ -18077,16 +17416,32 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
+; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
+; GFX9-NEXT:    s_mov_b32 s3, s7
+; GFX9-NEXT:    s_mov_b32 s2, s6
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s4, s8
+; GFX9-NEXT:    s_mov_b32 s5, s9
+; GFX9-NEXT:    s_mov_b32 s6, s10
+; GFX9-NEXT:    s_mov_b32 s7, s11
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -18104,16 +17459,32 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
+; GFX10-NEXT:    s_mov_b32 s3, s7
+; GFX10-NEXT:    s_mov_b32 s2, s6
+; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s4, s8
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
-; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, s9
+; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
+; GFX10-NEXT:    s_mov_b32 s6, s10
+; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
+; GFX10-NEXT:    s_mov_b32 s7, s11
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
+; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
+; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
+; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -18126,18 +17497,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX11-LABEL: test_call_external_void_func_v16bf16_inreg:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s34, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 exec_lo, s35
+; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX11-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
@@ -18153,19 +17524,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16bf16 at abs32@hi
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16bf16 at abs32@lo
+; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s35, external_void_func_v16bf16 at abs32@hi
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s34, external_void_func_v16bf16 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 408199bbc92237..37975b872800fb 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -847,11 +847,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s5, s33
+; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
-; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
@@ -885,19 +885,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s61, 29
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
-; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
+; GCN-NEXT:    s_mov_b64 s[4:5], exec
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_readfirstlane_b32 s8, v0
-; GCN-NEXT:    v_readfirstlane_b32 s9, v1
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    v_readfirstlane_b32 s6, v0
+; GCN-NEXT:    v_readfirstlane_b32 s7, v1
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
+; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT:    s_movk_i32 s0, 0x7b
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
+; GCN-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_cbranch_execnz .LBB6_1
 ; GCN-NEXT:  ; %bb.2:
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_readlane_b32 s63, v40, 31
 ; GCN-NEXT:    v_readlane_b32 s62, v40, 30
 ; GCN-NEXT:    v_readlane_b32 s61, v40, 29
@@ -930,22 +930,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_readlane_b32 s34, v40, 2
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
 ; GCN-NEXT:    v_readlane_b32 s30, v40, 0
-; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
-; GCN-NEXT:    s_mov_b32 s33, s5
+; GCN-NEXT:    s_mov_b32 s33, s10
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s5, s33
+; GISEL-NEXT:    s_mov_b32 s10, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
-; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0x400
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 0
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 1
@@ -979,19 +979,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s61, 29
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
-; GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
+; GISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
-; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v1
+; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
+; GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GISEL-NEXT:    s_movk_i32 s0, 0x7b
+; GISEL-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
-; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
+; GISEL-NEXT:    s_xor_b64 exec, exec, s[8:9]
 ; GISEL-NEXT:    s_cbranch_execnz .LBB6_1
 ; GISEL-NEXT:  ; %bb.2:
-; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    v_readlane_b32 s63, v40, 31
 ; GISEL-NEXT:    v_readlane_b32 s62, v40, 30
 ; GISEL-NEXT:    v_readlane_b32 s61, v40, 29
@@ -1024,11 +1024,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_readlane_b32 s34, v40, 2
 ; GISEL-NEXT:    v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v40, 0
-; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
-; GISEL-NEXT:    s_mov_b32 s33, s5
+; GISEL-NEXT:    s_mov_b32 s33, s10
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   call amdgpu_gfx void %fptr(i32 inreg 123)
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
index 0139c52db1d582..27e9d0fce9426d 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
@@ -10,7 +10,7 @@ define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i3
 ; CHECK-NEXT:    scratch_load_b32 v2, v0, off
 ; CHECK-NEXT:    scratch_load_b32 v3, v3, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_b64 v[2:3], v1, s[4:7], 0 offen
+; CHECK-NEXT:    buffer_store_b64 v[2:3], v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 
   %x0 = load i32, ptr addrspace(5) %src
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
index cdaac14833e0ef..ec1de0222d5d9b 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
@@ -6,7 +6,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %
 ; GCN-LABEL: sink_scratch_pointer:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lg_u32 s4, 0
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
 ; GCN-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %bb2
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off offset:-4
@@ -21,7 +21,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %
 ; GISEL-LABEL: sink_scratch_pointer:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s0, 0
 ; GISEL-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GISEL-NEXT:  ; %bb.1: ; %bb2
 ; GISEL-NEXT:    scratch_load_b32 v0, v0, off offset:-4
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 823c5456b23aff..e79cb66dcd7760 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -11,16 +11,14 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_mov_b32 s35, s3
-; GFX9-O0-NEXT:    s_mov_b32 s40, s2
-; GFX9-O0-NEXT:    s_mov_b32 s36, s1
-; GFX9-O0-NEXT:    s_mov_b32 s34, s0
+; GFX9-O0-NEXT:    s_mov_b32 s40, s6
+; GFX9-O0-NEXT:    s_mov_b32 s34, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT:    s_mov_b32 s41, s35
+; GFX9-O0-NEXT:    s_mov_b32 s41, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s41
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s36
+; GFX9-O0-NEXT:    s_mov_b32 s35, s5
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -78,11 +76,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    s_mov_b32 s36, s0
-; GFX9-O3-NEXT:    s_mov_b32 s37, s1
-; GFX9-O3-NEXT:    s_mov_b32 s38, s2
-; GFX9-O3-NEXT:    s_mov_b32 s39, s3
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
@@ -108,7 +102,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX9-O3-NEXT:    v_and_b32_e32 v4, 2, v4
-; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[36:39], 0 offset:4
+; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -156,16 +150,14 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[46:47], -1
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT:    s_mov_b32 s35, s3
-; GFX9-O0-NEXT:    s_mov_b32 s40, s2
-; GFX9-O0-NEXT:    s_mov_b32 s36, s1
-; GFX9-O0-NEXT:    s_mov_b32 s34, s0
+; GFX9-O0-NEXT:    s_mov_b32 s40, s6
+; GFX9-O0-NEXT:    s_mov_b32 s34, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT:    s_mov_b32 s41, s35
+; GFX9-O0-NEXT:    s_mov_b32 s41, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s41
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s36
+; GFX9-O0-NEXT:    s_mov_b32 s35, s5
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -274,15 +266,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    s_mov_b32 s40, s0
-; GFX9-O3-NEXT:    s_mov_b32 s41, s1
-; GFX9-O3-NEXT:    s_mov_b32 s42, s2
-; GFX9-O3-NEXT:    s_mov_b32 s43, s3
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[3:4], off, s[40:43], 0
-; GFX9-O3-NEXT:    s_mov_b32 s38, s2
-; GFX9-O3-NEXT:    s_mov_b32 s39, s3
-; GFX9-O3-NEXT:    s_mov_b32 s36, s0
-; GFX9-O3-NEXT:    s_mov_b32 s37, s1
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
@@ -301,17 +285,17 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX9-O3-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-O3-NEXT:  ; %bb.1: ; %if
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[40:41]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[40:41], -1
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[40:41]
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9-O3-NEXT:  .LBB1_2: ; %merge
 ; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[34:35]
@@ -319,7 +303,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-O3-NEXT:    v_and_b32_e32 v0, 2, v0
-; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -392,18 +376,16 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
-; GFX9-O0-NEXT:    v_writelane_b32 v2, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX9-O0-NEXT:    s_mov_b32 s35, s3
-; GFX9-O0-NEXT:    s_mov_b32 s40, s2
-; GFX9-O0-NEXT:    s_mov_b32 s36, s1
-; GFX9-O0-NEXT:    s_mov_b32 s34, s0
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O0-NEXT:    s_mov_b32 s40, s6
+; GFX9-O0-NEXT:    s_mov_b32 s34, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT:    s_mov_b32 s41, s35
+; GFX9-O0-NEXT:    s_mov_b32 s41, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s42, s41
 ; GFX9-O0-NEXT:    s_mov_b32 s43, s40
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s36
+; GFX9-O0-NEXT:    s_mov_b32 s35, s5
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s35
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s34
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -411,7 +393,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s43
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s42
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
@@ -445,7 +427,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-LABEL: strict_wwm_call:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_mov_b32 s42, s33
+; GFX9-O3-NEXT:    s_mov_b32 s38, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -454,35 +436,31 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_writelane_b32 v3, s30, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT:    v_writelane_b32 v2, s31, 1
-; GFX9-O3-NEXT:    s_mov_b32 s36, s0
-; GFX9-O3-NEXT:    s_mov_b32 s37, s1
-; GFX9-O3-NEXT:    s_mov_b32 s38, s2
-; GFX9-O3-NEXT:    s_mov_b32 s39, s3
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-O3-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-O3-NEXT:    s_getpc_b64 s[40:41]
-; GFX9-O3-NEXT:    s_add_u32 s40, s40, strict_wwm_called at rel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s41, s41, strict_wwm_called at rel32@hi+12
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
+; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called at rel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called at rel32@hi+12
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
-; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v2, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v2, 0
+; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-O3-NEXT:    s_mov_b32 s33, s42
+; GFX9-O3-NEXT:    s_mov_b32 s33, s38
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
@@ -603,19 +581,17 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x1000
 ; GFX9-O0-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v8, s31, 1
-; GFX9-O0-NEXT:    s_mov_b32 s34, s4
-; GFX9-O0-NEXT:    s_mov_b32 s35, s3
-; GFX9-O0-NEXT:    s_mov_b32 s38, s2
-; GFX9-O0-NEXT:    s_mov_b32 s40, s1
-; GFX9-O0-NEXT:    s_mov_b32 s36, s0
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
+; GFX9-O0-NEXT:    s_mov_b32 s34, s8
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
-; GFX9-O0-NEXT:    s_mov_b32 s39, s35
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s39
 ; GFX9-O0-NEXT:    s_mov_b32 s44, s38
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
-; GFX9-O0-NEXT:    s_mov_b32 s37, s40
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
 ; GFX9-O0-NEXT:    s_mov_b32 s45, s37
 ; GFX9-O0-NEXT:    s_mov_b32 s40, s36
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
@@ -627,7 +603,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O0-NEXT:    v_writelane_b32 v0, s42, 2
 ; GFX9-O0-NEXT:    v_writelane_b32 v0, s43, 3
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT:    s_mov_b32 s35, s5
+; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s34
@@ -710,7 +686,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-LABEL: strict_wwm_call_i64:
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT:    s_mov_b32 s44, s33
+; GFX9-O3-NEXT:    s_mov_b32 s40, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -727,36 +703,32 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-O3-NEXT:    v_writelane_b32 v8, s31, 1
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    s_getpc_b64 s[40:41]
-; GFX9-O3-NEXT:    s_add_u32 s40, s40, strict_wwm_called_i64 at gotpcrel32@lo+4
-; GFX9-O3-NEXT:    s_addc_u32 s41, s41, strict_wwm_called_i64 at gotpcrel32@hi+12
-; GFX9-O3-NEXT:    s_load_dwordx2 s[40:41], s[40:41], 0x0
-; GFX9-O3-NEXT:    s_mov_b32 s36, s0
-; GFX9-O3-NEXT:    s_mov_b32 s37, s1
-; GFX9-O3-NEXT:    s_mov_b32 s38, s2
-; GFX9-O3-NEXT:    s_mov_b32 s39, s3
+; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
+; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called_i64 at gotpcrel32@lo+4
+; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called_i64 at gotpcrel32@hi+12
+; GFX9-O3-NEXT:    s_load_dwordx2 s[36:37], s[36:37], 0x0
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s8
+; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s9
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
-; GFX9-O3-NEXT:    s_or_saveexec_b64 s[42:43], -1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v8
+; GFX9-O3-NEXT:    s_or_saveexec_b64 s[38:39], -1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[40:41]
+; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX9-O3-NEXT:    s_mov_b64 exec, s[42:43]
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT:    s_mov_b64 exec, s[38:39]
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v6, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v6, 0
+; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -767,7 +739,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT:    s_mov_b32 s33, s44
+; GFX9-O3-NEXT:    s_mov_b32 s33, s40
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
@@ -788,14 +760,11 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_mov_b32 s34, s3
-; GFX9-O0-NEXT:    s_mov_b32 s35, s2
-; GFX9-O0-NEXT:    s_mov_b32 s40, s1
-; GFX9-O0-NEXT:    s_mov_b32 s36, s0
+; GFX9-O0-NEXT:    s_mov_b32 s36, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT:    s_mov_b32 s37, s40
-; GFX9-O0-NEXT:    s_mov_b32 s38, s35
-; GFX9-O0-NEXT:    s_mov_b32 s39, s34
+; GFX9-O0-NEXT:    s_mov_b32 s37, s5
+; GFX9-O0-NEXT:    s_mov_b32 s38, s6
+; GFX9-O0-NEXT:    s_mov_b32 s39, s7
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 5
 ; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s34, v0
@@ -879,13 +848,9 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    s_mov_b32 s36, s0
-; GFX9-O3-NEXT:    s_mov_b32 s37, s1
-; GFX9-O3-NEXT:    s_mov_b32 s38, s2
-; GFX9-O3-NEXT:    s_mov_b32 s39, s3
 ; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[36:39], 0 offen
-; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[36:39], 0 offen offset:16
+; GFX9-O3-NEXT:    buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT:    buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_mov_b32 s34, -1
 ; GFX9-O3-NEXT:    s_brev_b32 s35, -2
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
@@ -914,8 +879,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[36:39], 0 offen
-; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[36:39], 0 offen offset:16
+; GFX9-O3-NEXT:    buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT:    buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -957,270 +922,270 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s64, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s65, 1
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s66, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s67, 3
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s68, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s69, 5
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s70, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v32, s71, 7
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b32 s34, s3
-; GFX9-O0-NEXT:    s_mov_b32 s35, s2
-; GFX9-O0-NEXT:    s_mov_b32 s36, s1
-; GFX9-O0-NEXT:    s_mov_b32 s37, s0
+; GFX9-O0-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s64, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s65, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s66, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v47, s67, 3
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:16
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s37
-; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s35
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s5
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s14
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s15
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s16
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s17
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s18
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s19
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s20
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s21
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s22
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s23
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s24
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s25
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s26
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v47, s27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s28
-; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s29
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_mov_b32_e32 v46, s25
+; GFX9-O0-NEXT:    v_mov_b32_e32 v45, s26
+; GFX9-O0-NEXT:    v_mov_b32_e32 v44, s27
+; GFX9-O0-NEXT:    v_mov_b32_e32 v43, s28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s29
+; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, v46
+; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, v45
+; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, v44
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v24, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v44
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v43
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v47
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v46
-; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v45
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v44
-; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr43 killed $exec
-; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v25, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    v_mov_b32_e32 v26, v42
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v27, v46
+; GFX9-O0-NEXT:    v_mov_b32_e32 v28, v45
+; GFX9-O0-NEXT:    v_mov_b32_e32 v29, v44
+; GFX9-O0-NEXT:    v_mov_b32_e32 v30, v43
+; GFX9-O0-NEXT:    ; kill: def $vgpr31 killed $vgpr42 killed $exec
+; GFX9-O0-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v11
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr38_sgpr39
-; GFX9-O0-NEXT:    ; implicit-def: $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71
-; GFX9-O0-NEXT:    s_mov_b64 s[38:39], 0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v11
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+; GFX9-O0-NEXT:    s_mov_b64 s[34:35], 0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, v11
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s38
-; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v32, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v33, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v34, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v35, v9
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s38
-; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v34, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v35, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v36, v6
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v37, v7
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s38
-; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v36, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v37, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v38, v4
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v39, v5
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s38
-; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v38, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v39, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v40, v2
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v41, v3
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s38
-; GFX9-O0-NEXT:    v_mov_b32_e32 v42, s39
+; GFX9-O0-NEXT:    v_mov_b32_e32 v40, s34
+; GFX9-O0-NEXT:    v_mov_b32_e32 v41, s35
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v33
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
@@ -1242,64 +1207,56 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v40
 ; GFX9-O0-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s37
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s36
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s35
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s34
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s9
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s10
-; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s11
-; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s12
-; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s13
-; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s14
-; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s15
-; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s16
-; GFX9-O0-NEXT:    v_mov_b32_e32 v17, s17
-; GFX9-O0-NEXT:    v_mov_b32_e32 v18, s18
-; GFX9-O0-NEXT:    v_mov_b32_e32 v19, s19
-; GFX9-O0-NEXT:    v_mov_b32_e32 v20, s20
-; GFX9-O0-NEXT:    v_mov_b32_e32 v21, s21
-; GFX9-O0-NEXT:    v_mov_b32_e32 v22, s22
-; GFX9-O0-NEXT:    v_mov_b32_e32 v23, s23
-; GFX9-O0-NEXT:    v_mov_b32_e32 v24, s24
-; GFX9-O0-NEXT:    v_mov_b32_e32 v25, s25
-; GFX9-O0-NEXT:    v_mov_b32_e32 v26, s26
-; GFX9-O0-NEXT:    v_mov_b32_e32 v27, s27
-; GFX9-O0-NEXT:    v_mov_b32_e32 v28, s28
-; GFX9-O0-NEXT:    v_mov_b32_e32 v29, s29
-; GFX9-O0-NEXT:    v_readlane_b32 s71, v32, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s70, v32, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s69, v32, 5
-; GFX9-O0-NEXT:    v_readlane_b32 s68, v32, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s67, v32, 3
-; GFX9-O0-NEXT:    v_readlane_b32 s66, v32, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s65, v32, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s64, v32, 0
-; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s11
+; GFX9-O0-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s13
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, s14
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-O0-NEXT:    v_mov_b32_e32 v12, s16
+; GFX9-O0-NEXT:    v_mov_b32_e32 v13, s17
+; GFX9-O0-NEXT:    v_mov_b32_e32 v14, s18
+; GFX9-O0-NEXT:    v_mov_b32_e32 v15, s19
+; GFX9-O0-NEXT:    v_mov_b32_e32 v16, s20
+; GFX9-O0-NEXT:    v_mov_b32_e32 v17, s21
+; GFX9-O0-NEXT:    v_mov_b32_e32 v18, s22
+; GFX9-O0-NEXT:    v_mov_b32_e32 v19, s23
+; GFX9-O0-NEXT:    v_mov_b32_e32 v20, s24
+; GFX9-O0-NEXT:    v_mov_b32_e32 v21, s25
+; GFX9-O0-NEXT:    v_mov_b32_e32 v22, s26
+; GFX9-O0-NEXT:    v_mov_b32_e32 v23, s27
+; GFX9-O0-NEXT:    v_mov_b32_e32 v24, s28
+; GFX9-O0-NEXT:    v_mov_b32_e32 v25, s29
+; GFX9-O0-NEXT:    v_readlane_b32 s67, v47, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s66, v47, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s65, v47, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s64, v47, 0
+; GFX9-O0-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    s_mov_b64 exec, -1
+; GFX9-O0-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -1308,26 +1265,30 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_load_dword v30, off, s[0:3], s32
-; GFX9-O3-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-O3-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_load_dword v26, off, s[0:3], s32
+; GFX9-O3-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:4
+; GFX9-O3-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:8
+; GFX9-O3-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:12
+; GFX9-O3-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:16
+; GFX9-O3-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:20
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v33, v2
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
@@ -1368,50 +1329,46 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:24
 ; GFX9-O3-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen offset:36
 ; GFX9-O3-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen offset:32
-; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
-; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s5
-; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s6
-; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s8
-; GFX9-O3-NEXT:    v_mov_b32_e32 v9, s9
-; GFX9-O3-NEXT:    v_mov_b32_e32 v10, s10
-; GFX9-O3-NEXT:    v_mov_b32_e32 v11, s11
-; GFX9-O3-NEXT:    v_mov_b32_e32 v12, s12
-; GFX9-O3-NEXT:    v_mov_b32_e32 v13, s13
-; GFX9-O3-NEXT:    v_mov_b32_e32 v14, s14
-; GFX9-O3-NEXT:    v_mov_b32_e32 v15, s15
-; GFX9-O3-NEXT:    v_mov_b32_e32 v16, s16
-; GFX9-O3-NEXT:    v_mov_b32_e32 v17, s17
-; GFX9-O3-NEXT:    v_mov_b32_e32 v18, s18
-; GFX9-O3-NEXT:    v_mov_b32_e32 v19, s19
-; GFX9-O3-NEXT:    v_mov_b32_e32 v20, s20
-; GFX9-O3-NEXT:    v_mov_b32_e32 v21, s21
-; GFX9-O3-NEXT:    v_mov_b32_e32 v22, s22
-; GFX9-O3-NEXT:    v_mov_b32_e32 v23, s23
-; GFX9-O3-NEXT:    v_mov_b32_e32 v24, s24
-; GFX9-O3-NEXT:    v_mov_b32_e32 v25, s25
-; GFX9-O3-NEXT:    v_mov_b32_e32 v26, s26
-; GFX9-O3-NEXT:    v_mov_b32_e32 v27, s27
-; GFX9-O3-NEXT:    v_mov_b32_e32 v28, s28
-; GFX9-O3-NEXT:    v_mov_b32_e32 v29, s29
+; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-O3-NEXT:    v_mov_b32_e32 v4, s8
+; GFX9-O3-NEXT:    v_mov_b32_e32 v5, s9
+; GFX9-O3-NEXT:    v_mov_b32_e32 v6, s10
+; GFX9-O3-NEXT:    v_mov_b32_e32 v7, s11
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, s12
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, s13
+; GFX9-O3-NEXT:    v_mov_b32_e32 v10, s14
+; GFX9-O3-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-O3-NEXT:    v_mov_b32_e32 v12, s16
+; GFX9-O3-NEXT:    v_mov_b32_e32 v13, s17
+; GFX9-O3-NEXT:    v_mov_b32_e32 v14, s18
+; GFX9-O3-NEXT:    v_mov_b32_e32 v15, s19
+; GFX9-O3-NEXT:    v_mov_b32_e32 v16, s20
+; GFX9-O3-NEXT:    v_mov_b32_e32 v17, s21
+; GFX9-O3-NEXT:    v_mov_b32_e32 v18, s22
+; GFX9-O3-NEXT:    v_mov_b32_e32 v19, s23
+; GFX9-O3-NEXT:    v_mov_b32_e32 v20, s24
+; GFX9-O3-NEXT:    v_mov_b32_e32 v21, s25
+; GFX9-O3-NEXT:    v_mov_b32_e32 v22, s26
+; GFX9-O3-NEXT:    v_mov_b32_e32 v23, s27
+; GFX9-O3-NEXT:    v_mov_b32_e32 v24, s28
+; GFX9-O3-NEXT:    v_mov_b32_e32 v25, s29
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]



More information about the llvm-commits mailing list